Index: include/llvm/ADT/PostOrderIterator.h =================================================================== --- include/llvm/ADT/PostOrderIterator.h +++ include/llvm/ADT/PostOrderIterator.h @@ -296,12 +296,15 @@ public: using rpo_iterator = typename std::vector::reverse_iterator; + using const_rpo_iterator = typename std::vector::const_reverse_iterator; ReversePostOrderTraversal(GraphT G) { Initialize(GT::getEntryNode(G)); } // Because we want a reverse post order, use reverse iterators from the vector rpo_iterator begin() { return Blocks.rbegin(); } + const_rpo_iterator begin() const { return Blocks.crbegin(); } rpo_iterator end() { return Blocks.rend(); } + const_rpo_iterator end() const { return Blocks.crend(); } }; } // end namespace llvm Index: include/llvm/Analysis/DivergenceAnalysis.h =================================================================== --- include/llvm/Analysis/DivergenceAnalysis.h +++ include/llvm/Analysis/DivergenceAnalysis.h @@ -7,55 +7,239 @@ // //===----------------------------------------------------------------------===// // -// The divergence analysis is an LLVM pass which can be used to find out -// if a branch instruction in a GPU program is divergent or not. It can help -// branch optimizations such as jump threading and loop unswitching to make -// better decisions. +// \file +// The divergence analysis determines which instructions and branches are +// divergent given a set of divergent source instructions. // //===----------------------------------------------------------------------===// + #ifndef LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H #define LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H #include "llvm/ADT/DenseSet.h" +#include "llvm/Analysis/SyncDependenceAnalysis.h" #include "llvm/IR/Function.h" #include "llvm/Pass.h" +#include namespace llvm { +class Module; class Value; -class DivergenceAnalysis : public FunctionPass { +class Instruction; +class Loop; +class raw_ostream; +class TargetTransformInfo; + +/// \brief Generic divergence analysis for reducible CFGs. +/// +/// This analysis propagates divergence in a data-parallel context from sources +/// of divergence to all users. It requires reducible CFGs. All assignments +/// should be in SSA form. +class DivergenceAnalysis { +public: + /// \brief This instance will analyze the whole function \p or the loop \p + /// RegionLoop. + /// + /// \param RegionLoop If non-null the analysis is restricted to \p RegionLoop. + /// Otherwise the whole function is analyzed. + /// \param IsLCSSAFOrm whether the analysis may assume that the IR in the + /// region in in LCSSA form. + DivergenceAnalysis(const Function &F, const Loop *RegionLoop, + const DominatorTree &DT, const LoopInfo &LI, + SyncDependenceAnalysis &SDA, bool IsLCSSAForm); + + /// \brief the loop that defines the analyzed region (if any). + const Loop *getRegionLoop() const { return RegionLoop; } + const Function &getFunction() const { return F; } + + /// \brief Whether \p BB is part of the region. + bool inRegion(const BasicBlock &BB) const; + /// \brief Whether \p I is part of the region. + bool inRegion(const Instruction &I) const; + + /// \brief Mark \p UniVal as a value that is always uniform. + void addUniformOverride(const Value &UniVal); + + /// \brief Mark \p DivVal as a value that is always divergent. + void markDivergent(const Value &DivVal); + + /// \brief Propagate divergence to all instructions in the region. + /// Divergence is seeded by calls to \p markDivergent. + void compute(); + + /// \brief Whether any value was marked or analyzed to be divergent. + bool hasDetectedDivergence() const { return !DivergentValues.empty(); } + + /// \brief Whether \p Val will always return a uniform value regardless of its + /// operands + bool isAlwaysUniform(const Value &Val) const; + + /// \brief Whether \p Val is a divergent value + bool isDivergent(const Value &Val) const; + + void print(raw_ostream &OS, const Module *) const; + +private: + bool updateTerminator(const TerminatorInst &Term) const; + bool updatePHINode(const PHINode &Phi) const; + + /// \returns whether \p Inst should be considered divergent based on the + /// divergence of its operands. + /// + /// This should only be called for non-phi, non-terminator instructions. + bool updateNormalInstruction(const Instruction &Inst) const; + + /// \brief mark users of live-out users as divergent. + /// + /// \param LoopHeader the header of the divergent loop. + /// + /// marks all users of live-out values of the loop headed by @loopHeader + /// as divergent and puts them on the worklist. + void taintLoopLiveOuts(const BasicBlock &LoopHeader); + + /// \brief Push all users of \p Val (in the region) to the worklist + void pushUsers(const Value &I); + + /// \brief Push all phi nodes in @block to the worklist + void pushPHINodes(const BasicBlock &Block); + + /// \brief Mark \p Block as join divergent + /// + /// A block is join divergent if two threads may reach it from different + /// incoming blocks at the same time. + void markBlockJoinDivergent(const BasicBlock &Block) { + DivergentJoinBlocks.insert(&Block); + } + + /// \brief Whether \p Val is divergent when read in \p ObservingBlock. + bool isTemporalDivergent(const BasicBlock &ObservingBlock, + const Value &Val) const; + + /// \brief Whether \p Block is join divergent + /// + /// (see markBlockJoinDivergent). + bool isJoinDivergent(const BasicBlock &Block) const { + return DivergentJoinBlocks.count(&Block); + } + + /// \brief Propagate control-induced divergence to users (phi nodes and + /// instructions). + // + // \param JoinBlock is a divergent loop exit or join point of two disjoint + // paths. + // \returns Whether \p JoinBlock is a divergent loop exit of \p TermLoop. + bool propagateJoinDivergence(const BasicBlock &JoinBlock, + const Loop *TermLoop); + + /// \brief Propagate induced value divergence due to control divergence in \p + /// Term. + void propagateBranchDivergence(const TerminatorInst &Term); + + /// \brief Propagate divergent caused by a divergent loop exit. + /// + /// \param ExitingLoop is a divergent loop. + void propagateLoopDivergence(const Loop &ExitingLoop); + +private: + const Function &F; + // If regionLoop != nullptr, analysis is only performed within \p RegionLoop. + // Otw, analyze the whole function + const Loop *RegionLoop; + + const DominatorTree &DT; + const LoopInfo &LI; + + // Recognized divergent loops + DenseSet DivergentLoops; + + // The SDA links divergent branches to divergent control-flow joins. + SyncDependenceAnalysis &SDA; + + // Use simplified code path for LCSSA form. + bool IsLCSSAForm; + + // Set of known-uniform values. + DenseSet UniformOverrides; + + // Blocks with joining divergent control from different predecessors. + DenseSet DivergentJoinBlocks; + + // Detected/marked divergent values. + DenseSet DivergentValues; + + // Internal worklist for divergence propagation. + std::vector Worklist; +}; + +/// \brief Divergence analysis frontend for loops. +class LoopDivergenceAnalysis { +public: + LoopDivergenceAnalysis(const DominatorTree &DT, const LoopInfo &LI, + SyncDependenceAnalysis &SDA, const Loop &loop); + + /// Whether \p V is divergent. + bool isDivergent(const Value &V) const; + + /// Whether \p V is uniform/non-divergent. + bool isUniform(const Value &V) const { return !isDivergent(V); } + + /// Print all divergent values in the loop. + void print(raw_ostream &OS, const Module *) const; + +private: + DivergenceAnalysis DA; +}; + +/// \brief Loop divergence printer pass. +/// This is intended for use in LIT testing. +class LoopDivergencePrinter : public FunctionPass { public: static char ID; - DivergenceAnalysis() : FunctionPass(ID) { - initializeDivergenceAnalysisPass(*PassRegistry::getPassRegistry()); + LoopDivergencePrinter() : FunctionPass(ID) { + initializeLoopDivergencePrinterPass(*PassRegistry::getPassRegistry()); } void getAnalysisUsage(AnalysisUsage &AU) const override; + /// Analyze all loop-divergence of all loops in @F and print the results. bool runOnFunction(Function &F) override; - // Print all divergent branches in the function. + /// Print all divergent values in the function. void print(raw_ostream &OS, const Module *) const override; - // Returns true if V is divergent at its definition. - // - // Even if this function returns false, V may still be divergent when used - // in a different basic block. - bool isDivergent(const Value *V) const { return DivergentValues.count(V); } +private: + std::unique_ptr SDA; + SmallVector, 6> LoopDivInfo; +}; - // Returns true if V is uniform/non-divergent. - // - // Even if this function returns true, V may still be divergent when used - // in a different basic block. - bool isUniform(const Value *V) const { return !isDivergent(V); } +// divergence analysis frontend for GPU kernels +class GPUDivergenceAnalysis { + SyncDependenceAnalysis SDA; + DivergenceAnalysis DA; - // Keep the analysis results uptodate by removing an erased value. - void removeValue(const Value *V) { DivergentValues.erase(V); } +public: + // runs the divergence analysis on @F, a GPU kernel + GPUDivergenceAnalysis(Function &F, const DominatorTree &DT, + const PostDominatorTree &PDT, const LoopInfo &LI, + const TargetTransformInfo &TTI); -private: - // Stores all divergent values. - DenseSet DivergentValues; + // whether any divergence was detected in @F + bool hasDivergence() const { return DA.hasDetectedDivergence(); } + + // the GPU kernel this analysis result is for + const Function &getFunction() const { return DA.getFunction(); } + + // whether @V is divergent + bool isDivergent(const Value &val) const; + + // whether @V is uniform/non-divergent + bool isUniform(const Value &val) const { return !isDivergent(val); } + + // print all divergent values in the kernel F + void print(raw_ostream &OS, const Module *) const; }; -} // End llvm namespace -#endif //LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H \ No newline at end of file +} // namespace llvm + +#endif // LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H Index: include/llvm/Analysis/LegacyDivergenceAnalysis.h =================================================================== --- include/llvm/Analysis/LegacyDivergenceAnalysis.h +++ include/llvm/Analysis/LegacyDivergenceAnalysis.h @@ -1,4 +1,4 @@ -//===- llvm/Analysis/DivergenceAnalysis.h - Divergence Analysis -*- C++ -*-===// +//===- llvm/Analysis/LegacyDivergenceAnalysis.h - KernelDivergence Analysis -*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -7,27 +7,29 @@ // //===----------------------------------------------------------------------===// // -// The divergence analysis is an LLVM pass which can be used to find out -// if a branch instruction in a GPU program is divergent or not. It can help +// The kernel divergence analysis is an LLVM pass which can be used to find out +// if a branch instruction in a GPU program (kernel) is divergent or not. It can help // branch optimizations such as jump threading and loop unswitching to make // better decisions. // //===----------------------------------------------------------------------===// -#ifndef LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H -#define LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H +#ifndef LLVM_ANALYSIS_LEGACY_DIVERGENCE_ANALYSIS_H +#define LLVM_ANALYSIS_LEGACY_DIVERGENCE_ANALYSIS_H #include "llvm/ADT/DenseSet.h" #include "llvm/IR/Function.h" #include "llvm/Pass.h" +#include "llvm/Analysis/DivergenceAnalysis.h" namespace llvm { class Value; -class DivergenceAnalysis : public FunctionPass { +class GPUDivergenceAnalysis; +class LegacyDivergenceAnalysis : public FunctionPass { public: static char ID; - DivergenceAnalysis() : FunctionPass(ID) { - initializeDivergenceAnalysisPass(*PassRegistry::getPassRegistry()); + LegacyDivergenceAnalysis() : FunctionPass(ID) { + initializeLegacyDivergenceAnalysisPass(*PassRegistry::getPassRegistry()); } void getAnalysisUsage(AnalysisUsage &AU) const override; @@ -41,7 +43,7 @@ // // Even if this function returns false, V may still be divergent when used // in a different basic block. - bool isDivergent(const Value *V) const { return DivergentValues.count(V); } + bool isDivergent(const Value *V) const; // Returns true if V is uniform/non-divergent. // @@ -53,9 +55,15 @@ void removeValue(const Value *V) { DivergentValues.erase(V); } private: + // Whether analysis should be performed by GPUDivergenceAnalysis. + bool shouldUseGPUDivergenceAnalysis(const Function &F) const; + + // (optional) handle to new DivergenceAnalysis + std::unique_ptr gpuDA; + // Stores all divergent values. DenseSet DivergentValues; }; } // End llvm namespace -#endif //LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H \ No newline at end of file +#endif //LLVM_ANALYSIS_LEGACY_DIVERGENCE_ANALYSIS_H Index: include/llvm/Analysis/Passes.h =================================================================== --- include/llvm/Analysis/Passes.h +++ include/llvm/Analysis/Passes.h @@ -61,10 +61,17 @@ //===--------------------------------------------------------------------===// // - // createDivergenceAnalysisPass - This pass determines which branches in a GPU + // createLegacyDivergenceAnalysisPass - This pass determines which branches in a GPU // program are divergent. // - FunctionPass *createDivergenceAnalysisPass(); + FunctionPass *createLegacyDivergenceAnalysisPass(); + + //===--------------------------------------------------------------------===// + // + // createLoopDivergencePrinterPass - This pass determines which branches and + // instructions in a loop are divergent. + // + FunctionPass *createLoopDivergencePrinterPass(); //===--------------------------------------------------------------------===// // Index: include/llvm/Analysis/SyncDependenceAnalysis.h =================================================================== --- /dev/null +++ include/llvm/Analysis/SyncDependenceAnalysis.h @@ -0,0 +1,88 @@ +//===- SyncDependenceAnalysis.h - Divergent Branch Dependence -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// \file +// This file defines the SyncDependenceAnalysis class, which computes for +// every divergent branch the set of phi nodes that the branch will make +// divergent. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_SYNC_DEPENDENCE_ANALYSIS_H +#define LLVM_ANALYSIS_SYNC_DEPENDENCE_ANALYSIS_H + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/Analysis/LoopInfo.h" +#include + +namespace llvm { + +class BasicBlock; +class DominatorTree; +class Loop; +class PostDominatorTree; +class TerminatorInst; +class TerminatorInst; + +using ConstBlockSet = SmallPtrSet; + +/// \brief Relates points of divergent control to join points in +/// reducible CFGs. +/// +/// This analysis relates points of divergent control to points of converging +/// divergent control. The analysis requires all loops to be reducible. +class SyncDependenceAnalysis { + void visitSuccessor(const BasicBlock &succBlock, const Loop *termLoop, + const BasicBlock *defBlock); + +public: + bool inRegion(const BasicBlock &BB) const; + + ~SyncDependenceAnalysis(); + SyncDependenceAnalysis(const DominatorTree &DT, const PostDominatorTree &PDT, + const LoopInfo &LI); + + /// \brief Computes divergent join points and loop exits caused by divergent + /// in \p Term. + /// + /// The set of blocks which are reachable by disjoint paths from \p Term. + /// The set also contains loop exits if there two disjoin paths: + /// one from \p Term to the loop exit and another from @term to the loop + /// header. Those exit blocks are added to the returned set. If L is the + /// parent loop of @term and an exit of L is in the returned set then L is a + /// divergent loop. + const ConstBlockSet &join_blocks(const TerminatorInst &term); + + /// \brief Computes divergent join points and loop exits caused by the + /// divergent loop \p Loop. + /// + /// The set of blocks which are reachable by disjoint paths from the + /// loop exits of @loop. + /// This treats the loop as a single node in @loop's parent loop. + /// The returned set has the same properties as for join_blocks(TermInst&). + const ConstBlockSet &join_blocks(const Loop &Loop); + +private: + static ConstBlockSet EmptyBlockSet; + + ReversePostOrderTraversal FuncRPOT; + const DominatorTree &DT; + const PostDominatorTree &PDT; + const LoopInfo &LI; + + std::map> CachedLoopExitJoins; + std::map> + CachedBranchJoins; +}; + +} // namespace llvm + +#endif // LLVM_ANALYSIS_SYNC_DEPENDENCE_ANALYSIS_H Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -289,7 +289,7 @@ /// Returns whether V is a source of divergence. /// /// This function provides the target-dependent information for - /// the target-independent DivergenceAnalysis. DivergenceAnalysis first + /// the target-independent LegacyDivergenceAnalysis. LegacyDivergenceAnalysis first /// builds the dependency graph, and then runs the reachability algorithm /// starting with the sources of divergence. bool isSourceOfDivergence(const Value *V) const; Index: include/llvm/CodeGen/SelectionDAG.h =================================================================== --- include/llvm/CodeGen/SelectionDAG.h +++ include/llvm/CodeGen/SelectionDAG.h @@ -28,7 +28,7 @@ #include "llvm/ADT/iterator.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/CodeGen/DAGCombine.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/ISDOpcodes.h" @@ -229,7 +229,7 @@ LLVMContext *Context; CodeGenOpt::Level OptLevel; - DivergenceAnalysis * DA = nullptr; + LegacyDivergenceAnalysis * DA = nullptr; FunctionLoweringInfo * FLI = nullptr; /// The function-level optimization remark emitter. Used to emit remarks @@ -382,7 +382,7 @@ /// Prepare this SelectionDAG to process code in the given MachineFunction. void init(MachineFunction &NewMF, OptimizationRemarkEmitter &NewORE, Pass *PassPtr, const TargetLibraryInfo *LibraryInfo, - DivergenceAnalysis * Divergence); + LegacyDivergenceAnalysis * Divergence); void setFunctionLoweringInfo(FunctionLoweringInfo * FuncInfo) { FLI = FuncInfo; Index: include/llvm/CodeGen/TargetLowering.h =================================================================== --- include/llvm/CodeGen/TargetLowering.h +++ include/llvm/CodeGen/TargetLowering.h @@ -29,7 +29,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" -#include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/CodeGen/DAGCombine.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/RuntimeLibcalls.h" @@ -2649,7 +2649,7 @@ virtual bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, - DivergenceAnalysis *DA) const { + LegacyDivergenceAnalysis *DA) const { return false; } Index: include/llvm/InitializePasses.h =================================================================== --- include/llvm/InitializePasses.h +++ include/llvm/InitializePasses.h @@ -119,7 +119,6 @@ void initializeDependenceAnalysisWrapperPassPass(PassRegistry&); void initializeDetectDeadLanesPass(PassRegistry&); void initializeDivRemPairsLegacyPassPass(PassRegistry&); -void initializeDivergenceAnalysisPass(PassRegistry&); void initializeDomOnlyPrinterPass(PassRegistry&); void initializeDomOnlyViewerPass(PassRegistry&); void initializeDomPrinterPass(PassRegistry&); @@ -191,6 +190,7 @@ void initializeLazyMachineBlockFrequencyInfoPassPass(PassRegistry&); void initializeLazyValueInfoPrinterPass(PassRegistry&); void initializeLazyValueInfoWrapperPassPass(PassRegistry&); +void initializeLegacyDivergenceAnalysisPass(PassRegistry&); void initializeLegacyLICMPassPass(PassRegistry&); void initializeLegacyLoopSinkPassPass(PassRegistry&); void initializeLegalizerPass(PassRegistry&); @@ -211,6 +211,7 @@ void initializeLoopDataPrefetchLegacyPassPass(PassRegistry&); void initializeLoopDeletionLegacyPassPass(PassRegistry&); void initializeLoopDistributeLegacyPass(PassRegistry&); +void initializeLoopDivergencePrinterPass(PassRegistry&); void initializeLoopExtractorPass(PassRegistry&); void initializeLoopGuardWideningLegacyPassPass(PassRegistry&); void initializeLoopIdiomRecognizeLegacyPassPass(PassRegistry&); Index: include/llvm/LinkAllPasses.h =================================================================== --- include/llvm/LinkAllPasses.h +++ include/llvm/LinkAllPasses.h @@ -94,7 +94,6 @@ (void) llvm::createDeadInstEliminationPass(); (void) llvm::createDeadStoreEliminationPass(); (void) llvm::createDependenceAnalysisWrapperPass(); - (void) llvm::createDivergenceAnalysisPass(); (void) llvm::createDomOnlyPrinterPass(); (void) llvm::createDomPrinterPass(); (void) llvm::createDomOnlyViewerPass(); @@ -121,7 +120,9 @@ (void) llvm::createInstructionCombiningPass(); (void) llvm::createInternalizePass(); (void) llvm::createLCSSAPass(); + (void) llvm::createLegacyDivergenceAnalysisPass(); (void) llvm::createLICMPass(); + (void) llvm::createLoopDivergencePrinterPass(); (void) llvm::createLoopSinkPass(); (void) llvm::createLazyValueInfoPass(); (void) llvm::createLoopExtractorPass(); Index: lib/Analysis/Analysis.cpp =================================================================== --- lib/Analysis/Analysis.cpp +++ lib/Analysis/Analysis.cpp @@ -39,7 +39,6 @@ initializeDependenceAnalysisWrapperPassPass(Registry); initializeDelinearizationPass(Registry); initializeDemandedBitsWrapperPassPass(Registry); - initializeDivergenceAnalysisPass(Registry); initializeDominanceFrontierWrapperPassPass(Registry); initializeDomViewerPass(Registry); initializeDomPrinterPass(Registry); @@ -58,7 +57,9 @@ initializeLazyBlockFrequencyInfoPassPass(Registry); initializeLazyValueInfoWrapperPassPass(Registry); initializeLazyValueInfoPrinterPass(Registry); + initializeLegacyDivergenceAnalysisPass(Registry); initializeLintPass(Registry); + initializeLoopDivergencePrinterPass(Registry); initializeLoopInfoWrapperPassPass(Registry); initializeMemDepPrinterPass(Registry); initializeMemDerefPrinterPass(Registry); Index: lib/Analysis/CMakeLists.txt =================================================================== --- lib/Analysis/CMakeLists.txt +++ lib/Analysis/CMakeLists.txt @@ -42,6 +42,7 @@ LazyBlockFrequencyInfo.cpp LazyCallGraph.cpp LazyValueInfo.cpp + LegacyDivergenceAnalysis.cpp Lint.cpp Loads.cpp LoopAccessAnalysis.cpp @@ -76,6 +77,7 @@ ScalarEvolutionAliasAnalysis.cpp ScalarEvolutionExpander.cpp ScalarEvolutionNormalization.cpp + SyncDependenceAnalysis.cpp SyntheticCountsUtils.cpp TargetLibraryInfo.cpp TargetTransformInfo.cpp Index: lib/Analysis/DivergenceAnalysis.cpp =================================================================== --- lib/Analysis/DivergenceAnalysis.cpp +++ lib/Analysis/DivergenceAnalysis.cpp @@ -7,9 +7,10 @@ // //===----------------------------------------------------------------------===// // -// This file implements divergence analysis which determines whether a branch -// in a GPU program is divergent.It can help branch optimizations such as jump -// threading and loop unswitching to make better decisions. +// This file implements a general divergence analysis for loop vectorization +// and GPU programs. It determines whether a branch in a loop or GPU program +// is divergent. It can help branch optimizations such as jump threading and +// loop unswitching to make better decisions. // // GPU programs typically use the SIMD execution model, where multiple threads // in the same execution group have to execute in lock-step. Therefore, if the @@ -25,16 +26,23 @@ // optimizations. // // This file defines divergence analysis which computes a conservative but -// non-trivial approximation of all divergent branches in a GPU program. It -// partially implements the approach described in +// non-trivial approximation of all divergent branches in a GPU program. This +// implementation is derived from the Vectorization Analysis of the Region +// Vectorizer (RV). That implementation in turn is based on the approach +// described in // -// Divergence Analysis -// Sampaio, Souza, Collange, Pereira -// TOPLAS '13 +// Improving Performance of OpenCL on CPUs +// Ralf Karrenberg and Sebastian Hack +// CC '12 // -// The divergence analysis identifies the sources of divergence (e.g., special -// variables that hold the thread ID), and recursively marks variables that are -// data or sync dependent on a source of divergence as divergent. +// The DivergenceAnalysis implementation is generic in the sense that it does +// not itself identify original sources of divergence. +// Instead specialized adapter classes, (LoopDivergenceAnalysis) for loops and +// (GPUDivergenceAnalysis) for GPU programs, identify the sources of divergence +// (e.g., special variables that hold the thread ID or the iteration variable). +// +// The generic implementation propagates divergence to variables that are data +// or sync dependent on a source of divergence. // // While data dependency is a well-known concept, the notion of sync dependency // is worth more explanation. Sync dependence characterizes the control flow @@ -54,287 +62,476 @@ // because the branch "br i1 %cond" depends on %tid and affects which value %a // is assigned to. // -// The current implementation has the following limitations: +// The sync dependence detection (which branch induces divergence in which join +// points) is implemented in the SyncDependenceAnalysis. +// +// The current DivergenceAnalysis implementation has the following limitations: // 1. intra-procedural. It conservatively considers the arguments of a // non-kernel-entry function and the return value of a function call as // divergent. // 2. memory as black box. It conservatively considers values loaded from // generic or local address as divergent. This can be improved by leveraging -// pointer analysis. +// pointer analysis and/or by modelling non-escaping memory objects in SSA +// as done in RV. // //===----------------------------------------------------------------------===// #include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/Passes.h" #include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Value.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include + using namespace llvm; -#define DEBUG_TYPE "divergence" - -namespace { - -class DivergencePropagator { -public: - DivergencePropagator(Function &F, TargetTransformInfo &TTI, DominatorTree &DT, - PostDominatorTree &PDT, DenseSet &DV) - : F(F), TTI(TTI), DT(DT), PDT(PDT), DV(DV) {} - void populateWithSourcesOfDivergence(); - void propagate(); - -private: - // A helper function that explores data dependents of V. - void exploreDataDependency(Value *V); - // A helper function that explores sync dependents of TI. - void exploreSyncDependency(TerminatorInst *TI); - // Computes the influence region from Start to End. This region includes all - // basic blocks on any simple path from Start to End. - void computeInfluenceRegion(BasicBlock *Start, BasicBlock *End, - DenseSet &InfluenceRegion); - // Finds all users of I that are outside the influence region, and add these - // users to Worklist. - void findUsersOutsideInfluenceRegion( - Instruction &I, const DenseSet &InfluenceRegion); - - Function &F; - TargetTransformInfo &TTI; - DominatorTree &DT; - PostDominatorTree &PDT; - std::vector Worklist; // Stack for DFS. - DenseSet &DV; // Stores all divergent values. -}; - -void DivergencePropagator::populateWithSourcesOfDivergence() { - Worklist.clear(); - DV.clear(); - for (auto &I : instructions(F)) { - if (TTI.isSourceOfDivergence(&I)) { - Worklist.push_back(&I); - DV.insert(&I); - } +#define DEBUG_TYPE "divergence-analysis" + +// class DivergenceAnalysis +DivergenceAnalysis::DivergenceAnalysis( + const Function &F, const Loop *RegionLoop, const DominatorTree &DT, + const LoopInfo &LI, SyncDependenceAnalysis &SDA, bool IsLCSSAForm) + : F(F), RegionLoop(RegionLoop), DT(DT), LI(LI), SDA(SDA), + IsLCSSAForm(IsLCSSAForm) {} + +void DivergenceAnalysis::markDivergent(const Value &DivVal) { + assert(isa(DivVal) || isa(DivVal)); + assert(!isAlwaysUniform(DivVal) && "cannot be a divergent"); + DivergentValues.insert(&DivVal); +} + +void DivergenceAnalysis::addUniformOverride(const Value &UniVal) { + UniformOverrides.insert(&UniVal); +} + +bool DivergenceAnalysis::updateTerminator(const TerminatorInst &Term) const { + if (Term.getNumSuccessors() <= 1) + return false; + if (auto *BranchTerm = dyn_cast(&Term)) { + assert(BranchTerm->isConditional()); + return isDivergent(*BranchTerm->getCondition()); } - for (auto &Arg : F.args()) { - if (TTI.isSourceOfDivergence(&Arg)) { - Worklist.push_back(&Arg); - DV.insert(&Arg); - } + if (auto *SwitchTerm = dyn_cast(&Term)) { + return isDivergent(*SwitchTerm->getCondition()); } + if (isa(Term)) { + return false; // ignore abnormal executions through landingpad + } + + abort(); } -void DivergencePropagator::exploreSyncDependency(TerminatorInst *TI) { - // Propagation rule 1: if branch TI is divergent, all PHINodes in TI's - // immediate post dominator are divergent. This rule handles if-then-else - // patterns. For example, - // - // if (tid < 5) - // a1 = 1; - // else - // a2 = 2; - // a = phi(a1, a2); // sync dependent on (tid < 5) - BasicBlock *ThisBB = TI->getParent(); - - // Unreachable blocks may not be in the dominator tree. - if (!DT.isReachableFromEntry(ThisBB)) - return; +bool DivergenceAnalysis::updateNormalInstruction(const Instruction &I) const { + // TODO function calls with side effects, etc + for (const auto &Op : I.operands()) { + if (isDivergent(*Op)) + return true; + } + return false; +} - // If the function has no exit blocks or doesn't reach any exit blocks, the - // post dominator may be null. - DomTreeNode *ThisNode = PDT.getNode(ThisBB); - if (!ThisNode) - return; +bool DivergenceAnalysis::isTemporalDivergent(const BasicBlock &ObservingBlock, + const Value &Val) const { + const auto *Inst = dyn_cast(&Val); + if (!Inst) + return false; + // check whether any divergent loop carrying @val terminates before control + // proceeds to @observingBlock + for (const auto *Loop = LI.getLoopFor(Inst->getParent()); + Loop != RegionLoop && !Loop->contains(&ObservingBlock); + Loop = Loop->getParentLoop()) { + if (DivergentLoops.find(Loop) != DivergentLoops.end()) + return true; + } - BasicBlock *IPostDom = ThisNode->getIDom()->getBlock(); - if (IPostDom == nullptr) - return; + return false; +} - for (auto I = IPostDom->begin(); isa(I); ++I) { - // A PHINode is uniform if it returns the same value no matter which path is - // taken. - if (!cast(I)->hasConstantOrUndefValue() && DV.insert(&*I).second) - Worklist.push_back(&*I); +bool DivergenceAnalysis::updatePHINode(const PHINode &Phi) const { + // joining divergent disjoint path in @phi parent block + if (!Phi.hasConstantOrUndefValue() && isJoinDivergent(*Phi.getParent())) { + return true; } - // Propagation rule 2: if a value defined in a loop is used outside, the user - // is sync dependent on the condition of the loop exits that dominate the - // user. For example, - // - // int i = 0; - // do { - // i++; - // if (foo(i)) ... // uniform - // } while (i < tid); - // if (bar(i)) ... // divergent - // - // A program may contain unstructured loops. Therefore, we cannot leverage - // LoopInfo, which only recognizes natural loops. + // An incoming value could be divergent by itself. + // Otherwise, an incoming value could be uniform within the loop + // that carries its definition but it may appear divergent + // from outside the loop. This happens when divergent loop exits + // drop definitions of that uniform value in different iterations. // - // The algorithm used here handles both natural and unstructured loops. Given - // a branch TI, we first compute its influence region, the union of all simple - // paths from TI to its immediate post dominator (IPostDom). Then, we search - // for all the values defined in the influence region but used outside. All - // these users are sync dependent on TI. - DenseSet InfluenceRegion; - computeInfluenceRegion(ThisBB, IPostDom, InfluenceRegion); - // An insight that can speed up the search process is that all the in-region - // values that are used outside must dominate TI. Therefore, instead of - // searching every basic blocks in the influence region, we search all the - // dominators of TI until it is outside the influence region. - BasicBlock *InfluencedBB = ThisBB; - while (InfluenceRegion.count(InfluencedBB)) { - for (auto &I : *InfluencedBB) - findUsersOutsideInfluenceRegion(I, InfluenceRegion); - DomTreeNode *IDomNode = DT.getNode(InfluencedBB)->getIDom(); - if (IDomNode == nullptr) - break; - InfluencedBB = IDomNode->getBlock(); + // for (int i = 0; i < n; ++i) { // 'i' is uniform inside the loop + // if (i % thread_id == 0) break; // divergent loop exit + // } + // int divI = i; // divI is divergent + for (size_t i = 0; i < Phi.getNumIncomingValues(); ++i) { + const auto *InVal = Phi.getIncomingValue(i); + if (isDivergent(*Phi.getIncomingValue(i)) || + isTemporalDivergent(*Phi.getParent(), *InVal)) { + return true; + } } + return false; +} + +bool DivergenceAnalysis::inRegion(const Instruction &I) const { + return I.getParent() && inRegion(*I.getParent()); } -void DivergencePropagator::findUsersOutsideInfluenceRegion( - Instruction &I, const DenseSet &InfluenceRegion) { - for (User *U : I.users()) { - Instruction *UserInst = cast(U); - if (!InfluenceRegion.count(UserInst->getParent())) { - if (DV.insert(UserInst).second) - Worklist.push_back(UserInst); +bool DivergenceAnalysis::inRegion(const BasicBlock &BB) const { + return (!RegionLoop && BB.getParent() == &F) || RegionLoop->contains(&BB); +} + +// marks all users of loop-carried values of the loop headed by @loopHeader as +// divergent +void DivergenceAnalysis::taintLoopLiveOuts(const BasicBlock &LoopHeader) { + auto *DivLoop = LI.getLoopFor(&LoopHeader); + assert(DivLoop && "loopHeader is not actually part of a loop"); + + SmallVector TaintStack; + DivLoop->getExitBlocks(TaintStack); + + // Otherwise potential users of loop-carried values could be anywhere in the + // dominance region of @DivLoop (including its fringes for phi nodes) + DenseSet Visited; + for (auto *Block : TaintStack) { + Visited.insert(Block); + } + Visited.insert(&LoopHeader); + + while (!TaintStack.empty()) { + auto *UserBlock = TaintStack.back(); + TaintStack.pop_back(); + + // don't spread divergence beyond the region + if (!inRegion(*UserBlock)) + continue; + + assert(!DivLoop->contains(UserBlock) && + "irreducible control flow detected"); + + // phi nodes at the fringes of the dominance region + if (!DT.dominates(&LoopHeader, UserBlock)) { + // all PHI nodes of @userBlock become divergent + for (auto &Phi : UserBlock->phis()) { + Worklist.push_back(&Phi); + } + continue; } + + // taint outside users of values carried by @DivLoop + for (auto &I : *UserBlock) { + if (isAlwaysUniform(I)) + continue; + if (isDivergent(I)) + continue; + + for (auto &Op : I.operands()) { + auto *OpInst = dyn_cast(&Op); + if (!OpInst) + continue; + if (DivLoop->contains(OpInst->getParent())) { + markDivergent(I); + pushUsers(I); + break; + } + } + } + + // visit all blocks in the dominance region + for (auto *SuccBlock : successors(UserBlock)) { + if (!Visited.insert(SuccBlock).second) { + continue; + } + TaintStack.push_back(SuccBlock); + } + } +} + +void DivergenceAnalysis::pushPHINodes(const BasicBlock &Block) { + for (const auto &Phi : Block.phis()) { + if (isDivergent(Phi)) + continue; + Worklist.push_back(&Phi); } } -// A helper function for computeInfluenceRegion that adds successors of "ThisBB" -// to the influence region. -static void -addSuccessorsToInfluenceRegion(BasicBlock *ThisBB, BasicBlock *End, - DenseSet &InfluenceRegion, - std::vector &InfluenceStack) { - for (BasicBlock *Succ : successors(ThisBB)) { - if (Succ != End && InfluenceRegion.insert(Succ).second) - InfluenceStack.push_back(Succ); +void DivergenceAnalysis::pushUsers(const Value &V) { + for (const auto *User : V.users()) { + const auto *UserInst = dyn_cast(User); + if (!UserInst) + continue; + + if (isDivergent(*UserInst)) + continue; + + // only compute divergent inside loop + if (!inRegion(*UserInst)) + continue; + Worklist.push_back(UserInst); } } -void DivergencePropagator::computeInfluenceRegion( - BasicBlock *Start, BasicBlock *End, - DenseSet &InfluenceRegion) { - assert(PDT.properlyDominates(End, Start) && - "End does not properly dominate Start"); - - // The influence region starts from the end of "Start" to the beginning of - // "End". Therefore, "Start" should not be in the region unless "Start" is in - // a loop that doesn't contain "End". - std::vector InfluenceStack; - addSuccessorsToInfluenceRegion(Start, End, InfluenceRegion, InfluenceStack); - while (!InfluenceStack.empty()) { - BasicBlock *BB = InfluenceStack.back(); - InfluenceStack.pop_back(); - addSuccessorsToInfluenceRegion(BB, End, InfluenceRegion, InfluenceStack); +bool DivergenceAnalysis::propagateJoinDivergence(const BasicBlock &JoinBlock, + const Loop *BranchLoop) { + LLVM_DEBUG(dbgs() << "\tpropJoinDiv " << JoinBlock.getName() << "\n"); + + // ignore divergence outside the region + if (!inRegion(JoinBlock)) { + return false; } + + // push non-divergent phi nodes in @joinBlock to the worklist + pushPHINodes(JoinBlock); + + // @joinBlock is a divergent loop exit + if (BranchLoop && !BranchLoop->contains(&JoinBlock)) { + return true; + } + + // disjoint-paths divergent at @joinBlock + markBlockJoinDivergent(JoinBlock); + return false; } -void DivergencePropagator::exploreDataDependency(Value *V) { - // Follow def-use chains of V. - for (User *U : V->users()) { - Instruction *UserInst = cast(U); - if (!TTI.isAlwaysUniform(U) && DV.insert(UserInst).second) - Worklist.push_back(UserInst); +void DivergenceAnalysis::propagateBranchDivergence(const TerminatorInst &Term) { + LLVM_DEBUG(dbgs() << "propBranchDiv " << Term.getParent()->getName() << "\n"); + + markDivergent(Term); + + const auto *BranchLoop = LI.getLoopFor(Term.getParent()); + + // whether there is a divergent loop exit from @branchLoop (if any) + bool IsBranchLoopDivergent = false; + + // iterate over all blocks reachable by disjoint from @term within the loop + // also iterates over loop exits that become divergent due to @term. + for (const auto *JoinBlock : SDA.join_blocks(Term)) { + IsBranchLoopDivergent |= propagateJoinDivergence(*JoinBlock, BranchLoop); + } + + // @branch loop is a divergent loop due to the divergent branch in @term + if (IsBranchLoopDivergent) { + assert(BranchLoop); + if (!DivergentLoops.insert(BranchLoop).second) { + return; + } + propagateLoopDivergence(*BranchLoop); } } -void DivergencePropagator::propagate() { - // Traverse the dependency graph using DFS. +void DivergenceAnalysis::propagateLoopDivergence(const Loop &ExitingLoop) { + LLVM_DEBUG(dbgs() << "propLoopDiv " << ExitingLoop.getName() << "\n"); + + // don't propagate beyond region + if (!inRegion(*ExitingLoop.getHeader())) + return; + + const auto *BranchLoop = ExitingLoop.getParentLoop(); + + // Uses of loop-carried values could occur anywhere + // within the dominance region of the definition. All loop-carried + // definitions are dominated by the loop header (reducible control). + // Thus all users have to be in the dominance region of the loop header, + // except PHI nodes that can also live at the fringes of the dom region + // (incoming defining value). + if (!IsLCSSAForm) + taintLoopLiveOuts(*ExitingLoop.getHeader()); + + // whether there is a divergent loop exit from @branchLoop (if any) + bool IsBranchLoopDivergent = false; + + // iterate over all blocks reachable by disjoint paths from exits of + // @exitingLoop also iterates over loop exits (of @branchLoop) that in turn + // become divergent. + for (const auto *JoinBlock : SDA.join_blocks(ExitingLoop)) { + IsBranchLoopDivergent |= propagateJoinDivergence(*JoinBlock, BranchLoop); + } + + // @branch loop is a divergent due to divergent loop exit in @exitingLoop + if (IsBranchLoopDivergent) { + assert(BranchLoop); + if (!DivergentLoops.insert(BranchLoop).second) { + return; + } + propagateLoopDivergence(*BranchLoop); + } +} + +void DivergenceAnalysis::compute() { + for (auto *DivVal : DivergentValues) { + pushUsers(*DivVal); + } + + // propagate divergence while (!Worklist.empty()) { - Value *V = Worklist.back(); + const Instruction &I = *Worklist.back(); Worklist.pop_back(); - if (TerminatorInst *TI = dyn_cast(V)) { - // Terminators with less than two successors won't introduce sync - // dependency. Ignore them. - if (TI->getNumSuccessors() > 1) - exploreSyncDependency(TI); + + // maintain uniformity of overrides + if (isAlwaysUniform(I)) + continue; + + bool WasDivergent = isDivergent(I); + if (WasDivergent) + continue; + + // propagate divergence caused by terminator + if (isa(I)) { + auto &Term = cast(I); + if (updateTerminator(Term)) { + // propagate control divergence to affected instructions + propagateBranchDivergence(Term); + continue; + } + } + + // update divergence of I due to divergent operands + bool DivergentUpd = false; + const auto *Phi = dyn_cast(&I); + if (Phi) { + DivergentUpd = updatePHINode(*Phi); + } else { + DivergentUpd = updateNormalInstruction(I); + } + + // propagate value divergence to users + if (DivergentUpd) { + markDivergent(I); + pushUsers(I); } - exploreDataDependency(V); } } -} /// end namespace anonymous +bool DivergenceAnalysis::isAlwaysUniform(const Value &V) const { + return UniformOverrides.find(&V) != UniformOverrides.end(); +} -// Register this pass. -char DivergenceAnalysis::ID = 0; -INITIALIZE_PASS_BEGIN(DivergenceAnalysis, "divergence", "Divergence Analysis", - false, true) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) -INITIALIZE_PASS_END(DivergenceAnalysis, "divergence", "Divergence Analysis", - false, true) +bool DivergenceAnalysis::isDivergent(const Value &V) const { + return DivergentValues.find(&V) != DivergentValues.end(); +} -FunctionPass *llvm::createDivergenceAnalysisPass() { - return new DivergenceAnalysis(); +void DivergenceAnalysis::print(raw_ostream &OS, const Module *) const { + if (DivergentValues.empty()) + return; + // iterate instructions using instructions() to ensure a deterministic order. + for (auto &I : instructions(F)) { + if (isDivergent(I)) + OS << "DIVERGENT:" << I << '\n'; + } } -void DivergenceAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired(); - AU.addRequired(); - AU.setPreservesAll(); +// class LoopDivergenceAnalysis +LoopDivergenceAnalysis::LoopDivergenceAnalysis(const DominatorTree &DT, + const LoopInfo &LI, + SyncDependenceAnalysis &SDA, + const Loop &Loop) + : DA(*Loop.getHeader()->getParent(), &Loop, DT, LI, SDA, true) { + for (const auto &Phi : Loop.getHeader()->phis()) { + DA.markDivergent(Phi); + } + + // after the scalar remainder loop is extracted, the loop exit condition will + // be uniform + auto LoopExitingInst = Loop.getExitingBlock()->getTerminator(); + auto LoopExitCond = cast(LoopExitingInst)->getCondition(); + DA.addUniformOverride(*LoopExitCond); + + DA.compute(); } -bool DivergenceAnalysis::runOnFunction(Function &F) { - auto *TTIWP = getAnalysisIfAvailable(); - if (TTIWP == nullptr) - return false; +bool LoopDivergenceAnalysis::isDivergent(const Value &V) const { + return DA.isDivergent(V); +} - TargetTransformInfo &TTI = TTIWP->getTTI(F); - // Fast path: if the target does not have branch divergence, we do not mark - // any branch as divergent. - if (!TTI.hasBranchDivergence()) - return false; +void LoopDivergenceAnalysis::print(raw_ostream &OS, const Module *Mod) const { + OS << "Divergence of loop " << DA.getRegionLoop()->getName() << " {\n"; + DA.print(OS, Mod); + OS << "}\n"; +} + +// class LoopDivergencePrinter +bool LoopDivergencePrinter::runOnFunction(Function &F) { + const PostDominatorTree &PDT = + getAnalysis().getPostDomTree(); + const DominatorTree &DT = + getAnalysis().getDomTree(); + const LoopInfo &LI = getAnalysis().getLoopInfo(); + SDA = make_unique(DT, PDT, LI); + + for (auto &BB : F) { + auto *Loop = LI.getLoopFor(&BB); + if (!Loop || Loop->getHeader() != &BB) + continue; + LoopDivInfo.push_back( + make_unique(DT, LI, *SDA, *Loop)); + } - DivergentValues.clear(); - auto &PDT = getAnalysis().getPostDomTree(); - DivergencePropagator DP(F, TTI, - getAnalysis().getDomTree(), - PDT, DivergentValues); - DP.populateWithSourcesOfDivergence(); - DP.propagate(); - LLVM_DEBUG( - dbgs() << "\nAfter divergence analysis on " << F.getName() << ":\n"; - print(dbgs(), F.getParent()) - ); return false; } -void DivergenceAnalysis::print(raw_ostream &OS, const Module *) const { - if (DivergentValues.empty()) - return; - const Value *FirstDivergentValue = *DivergentValues.begin(); - const Function *F; - if (const Argument *Arg = dyn_cast(FirstDivergentValue)) { - F = Arg->getParent(); - } else if (const Instruction *I = - dyn_cast(FirstDivergentValue)) { - F = I->getParent()->getParent(); - } else { - llvm_unreachable("Only arguments and instructions can be divergent"); +void LoopDivergencePrinter::print(raw_ostream &OS, const Module *Mod) const { + for (auto &DivInfo : LoopDivInfo) { + DivInfo->print(OS, Mod); } +} + +// Register this pass. +char LoopDivergencePrinter::ID = 0; +INITIALIZE_PASS_BEGIN(LoopDivergencePrinter, "loop-divergence", + "Loop Divergence Printer", false, true) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_END(LoopDivergencePrinter, "loop-divergence", + "Loop Divergence Printer", false, true) - // Dumps all divergent values in F, arguments and then instructions. - for (auto &Arg : F->args()) { - OS << (DivergentValues.count(&Arg) ? "DIVERGENT: " : " "); - OS << Arg << "\n"; +FunctionPass *llvm::createLoopDivergencePrinterPass() { + return new LoopDivergencePrinter(); +} + +void LoopDivergencePrinter::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.setPreservesAll(); +} + +// class GPUDivergenceAnalysis +GPUDivergenceAnalysis::GPUDivergenceAnalysis(Function &F, + const DominatorTree &DT, + const PostDominatorTree &PDT, + const LoopInfo &LI, + const TargetTransformInfo &TTI) + : SDA(DT, PDT, LI), DA(F, nullptr, DT, LI, SDA, false) { + for (auto &I : instructions(F)) { + if (TTI.isSourceOfDivergence(&I)) { + DA.markDivergent(I); + } else if (TTI.isAlwaysUniform(&I)) { + DA.addUniformOverride(I); + } } - // Iterate instructions using instructions() to ensure a deterministic order. - for (auto BI = F->begin(), BE = F->end(); BI != BE; ++BI) { - auto &BB = *BI; - OS << "\n " << BB.getName() << ":\n"; - for (auto &I : BB.instructionsWithoutDebug()) { - OS << (DivergentValues.count(&I) ? "DIVERGENT: " : " "); - OS << I << "\n"; + for (auto &Arg : F.args()) { + if (TTI.isSourceOfDivergence(&Arg)) { + DA.markDivergent(Arg); } } - OS << "\n"; + + DA.compute(); +} + +bool GPUDivergenceAnalysis::isDivergent(const Value &val) const { + return DA.isDivergent(val); +} + +void GPUDivergenceAnalysis::print(raw_ostream &OS, const Module *mod) const { + OS << "Divergence of kernel " << DA.getFunction().getName() << " {\n"; + DA.print(OS, mod); + OS << "}\n"; } Index: lib/Analysis/LegacyDivergenceAnalysis.cpp =================================================================== --- lib/Analysis/LegacyDivergenceAnalysis.cpp +++ lib/Analysis/LegacyDivergenceAnalysis.cpp @@ -1,4 +1,5 @@ -//===- DivergenceAnalysis.cpp --------- Divergence Analysis Implementation -==// +//===- LegacyDivergenceAnalysis.cpp --------- Legacy Divergence Analysis +//Implementation -==// // // The LLVM Compiler Infrastructure // @@ -64,7 +65,10 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/Analysis/CFG.h" #include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/Analysis/Passes.h" #include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -79,6 +83,12 @@ #define DEBUG_TYPE "divergence" +// transparently use the GPUDivergenceAnalysis +static cl::opt UseGPUDA("use-gpu-divergence-analysis", cl::init(false), + cl::Hidden, + cl::desc("turn the LegacyDivergenceAnalysis into " + "a wrapper for GPUDivergenceAnalysis")); + namespace { class DivergencePropagator { @@ -262,28 +272,44 @@ } } -} /// end namespace anonymous +} // namespace // Register this pass. -char DivergenceAnalysis::ID = 0; -INITIALIZE_PASS_BEGIN(DivergenceAnalysis, "divergence", "Divergence Analysis", - false, true) +char LegacyDivergenceAnalysis::ID = 0; +INITIALIZE_PASS_BEGIN(LegacyDivergenceAnalysis, "divergence", + "Legacy Divergence Analysis", false, true) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) -INITIALIZE_PASS_END(DivergenceAnalysis, "divergence", "Divergence Analysis", - false, true) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_END(LegacyDivergenceAnalysis, "divergence", + "Legacy Divergence Analysis", false, true) -FunctionPass *llvm::createDivergenceAnalysisPass() { - return new DivergenceAnalysis(); +FunctionPass *llvm::createLegacyDivergenceAnalysisPass() { + return new LegacyDivergenceAnalysis(); } -void DivergenceAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { +void LegacyDivergenceAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); AU.addRequired(); + if (UseGPUDA) + AU.addRequired(); AU.setPreservesAll(); } -bool DivergenceAnalysis::runOnFunction(Function &F) { +bool LegacyDivergenceAnalysis::shouldUseGPUDivergenceAnalysis( + const Function &F) const { + if (!UseGPUDA) + return false; + + // GPUDivergenceAnalysis requires a reducible CFG. + auto &LI = getAnalysis().getLoopInfo(); + using RPOTraversal = ReversePostOrderTraversal; + RPOTraversal FuncRPOT(&F); + return !containsIrreducibleCFG(FuncRPOT, LI); +} + +bool LegacyDivergenceAnalysis::runOnFunction(Function &F) { auto *TTIWP = getAnalysisIfAvailable(); if (TTIWP == nullptr) return false; @@ -295,36 +321,59 @@ return false; DivergentValues.clear(); + gpuDA = nullptr; + + auto &DT = getAnalysis().getDomTree(); auto &PDT = getAnalysis().getPostDomTree(); - DivergencePropagator DP(F, TTI, - getAnalysis().getDomTree(), - PDT, DivergentValues); - DP.populateWithSourcesOfDivergence(); - DP.propagate(); - LLVM_DEBUG( - dbgs() << "\nAfter divergence analysis on " << F.getName() << ":\n"; - print(dbgs(), F.getParent()) - ); + + if (shouldUseGPUDivergenceAnalysis(F)) { + // run the new GPU divergence analysis + auto &LI = getAnalysis().getLoopInfo(); + gpuDA = llvm::make_unique(F, DT, PDT, LI, TTI); + + } else { + // run LLVM's existing DivergenceAnalysis + DivergencePropagator DP(F, TTI, DT, PDT, DivergentValues); + DP.populateWithSourcesOfDivergence(); + DP.propagate(); + } + + LLVM_DEBUG(dbgs() << "\nAfter divergence analysis on " << F.getName() + << ":\n"; + print(dbgs(), F.getParent())); + return false; } -void DivergenceAnalysis::print(raw_ostream &OS, const Module *) const { - if (DivergentValues.empty()) +bool LegacyDivergenceAnalysis::isDivergent(const Value *V) const { + if (gpuDA) { + return gpuDA->isDivergent(*V); + } + return DivergentValues.count(V); +} + +void LegacyDivergenceAnalysis::print(raw_ostream &OS, const Module *) const { + if ((!gpuDA || !gpuDA->hasDivergence()) && DivergentValues.empty()) return; - const Value *FirstDivergentValue = *DivergentValues.begin(); + const Function *F; - if (const Argument *Arg = dyn_cast(FirstDivergentValue)) { - F = Arg->getParent(); - } else if (const Instruction *I = - dyn_cast(FirstDivergentValue)) { - F = I->getParent()->getParent(); - } else { - llvm_unreachable("Only arguments and instructions can be divergent"); + if (!DivergentValues.empty()) { + const Value *FirstDivergentValue = *DivergentValues.begin(); + if (const Argument *Arg = dyn_cast(FirstDivergentValue)) { + F = Arg->getParent(); + } else if (const Instruction *I = + dyn_cast(FirstDivergentValue)) { + F = I->getParent()->getParent(); + } else { + llvm_unreachable("Only arguments and instructions can be divergent"); + } + } else if (gpuDA) { + F = &gpuDA->getFunction(); } // Dumps all divergent values in F, arguments and then instructions. for (auto &Arg : F->args()) { - OS << (DivergentValues.count(&Arg) ? "DIVERGENT: " : " "); + OS << (isDivergent(&Arg) ? "DIVERGENT: " : " "); OS << Arg << "\n"; } // Iterate instructions using instructions() to ensure a deterministic order. @@ -332,7 +381,7 @@ auto &BB = *BI; OS << "\n " << BB.getName() << ":\n"; for (auto &I : BB.instructionsWithoutDebug()) { - OS << (DivergentValues.count(&I) ? "DIVERGENT: " : " "); + OS << (isDivergent(&I) ? "DIVERGENT: " : " "); OS << I << "\n"; } } Index: lib/Analysis/SyncDependenceAnalysis.cpp =================================================================== --- /dev/null +++ lib/Analysis/SyncDependenceAnalysis.cpp @@ -0,0 +1,381 @@ +//===- SyncDependenceAnalysis.cpp - Divergent Branch Dependence Calculation +//--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements an algorithm that returns for a divergent branch +// the set of basic blocks whose phi nodes become divergent due to divergent +// control. These are the blocks that are reachable by two disjoint paths from +// the branch or loop exits that have a reaching path that is disjoint from a +// path to the loop latch. +// +// The SyncDependenceAnalysis is used in the DivergenceAnalysis to model +// control-induced divergence in phi nodes. +// +// -- Summary -- +// The SyncDependenceAnalysis lazily computes sync dependences [3]. +// The analysis evaluates the disjoint path criterion [2] by a reduction +// to SSA construction. The SSA construction algorithm is implemented as +// a simple data-flow analysis [1]. +// +// [1] "A Simple, Fast Dominance Algorithm", SPI '01, Cooper, Harvey and Kennedy +// [2] "Efficiently Computing Static Single Assignment Form +// and the Control Dependence Graph", TOPLAS '91, +// Cytron, Ferrante, Rosen, Wegman and Zadeck +// [3] "Improving Performance of OpenCL on CPUs", CC '12, Karrenberg and Hack +// [4] "Divergence Analysis", TOPLAS '13, Sampaio, Souza, Collange and Pereira +// +// -- Sync dependence -- +// Sync dependence [4] characterizes the control flow aspect of the +// propagation of branch divergence. For example, +// +// %cond = icmp slt i32 %tid, 10 +// br i1 %cond, label %then, label %else +// then: +// br label %merge +// else: +// br label %merge +// merge: +// %a = phi i32 [ 0, %then ], [ 1, %else ] +// +// Suppose %tid holds the thread ID. Although %a is not data dependent on %tid +// because %tid is not on its use-def chains, %a is sync dependent on %tid +// because the branch "br i1 %cond" depends on %tid and affects which value %a +// is assigned to. +// +// -- Reduction to SSA construction -- +// There are two disjoint paths from A to X, if a certain variant of SSA +// construction places a phi node in X under the following set-up scheme [2]. +// +// This variant of SSA construction ignores incoming undef values. +// That is paths from the entry without a definition do not result in +// phi nodes. +// +// entry +// / \ +// A \ +// / \ Y +// B C / +// \ / \ / +// D E +// \ / +// F +// Assume that A contains a divergent branch. We are interested +// in the set of all blocks where each block is reachable from A +// via two disjoint paths. This would be the set {D, F} in this +// case. +// To generally reduce this query to SSA construction we introduce +// a virtual variable x and assign to x different values in each +// successor block of A. +// entry +// / \ +// A \ +// / \ Y +// x = 0 x = 1 / +// \ / \ / +// D E +// \ / +// F +// Our flavor of SSA construction for x will construct the following +// entry +// / \ +// A \ +// / \ Y +// x0 = 0 x1 = 1 / +// \ / \ / +// x2=phi E +// \ / +// x3=phi +// The blocks D and F contain phi nodes and are thus each reachable +// by two disjoins paths from A. +// +// -- Remarks -- +// In case of loop exits we need to check the disjoint path criterion for loops +// [2]. To this end, we check whether the definition of x differs between the +// loop exit and the loop header (_after_ SSA construction). +// +//===----------------------------------------------------------------------===// +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/Analysis/PostDominators.h" +#include "llvm/Analysis/SyncDependenceAnalysis.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" + +#include +#include + +#define DEBUG_TYPE "sync-dependence" + +namespace llvm { + +ConstBlockSet SyncDependenceAnalysis::EmptyBlockSet; + +SyncDependenceAnalysis::SyncDependenceAnalysis(const DominatorTree &DT, + const PostDominatorTree &PDT, + const LoopInfo &LI) + : FuncRPOT(DT.getRoot()->getParent()), DT(DT), PDT(PDT), LI(LI) {} + +SyncDependenceAnalysis::~SyncDependenceAnalysis() {} + +using FunctionRPOT = ReversePostOrderTraversal; + +// divergence propagator for reducible CFGs +struct DivergencePropagator { + const FunctionRPOT &FuncRPOT; + const DominatorTree &DT; + const PostDominatorTree &PDT; + const LoopInfo &LI; + + // identified join points + std::unique_ptr JoinBlocks; + + // reached loop exits (by a path disjoint to a path to the loop header) + SmallPtrSet ReachedLoopExits; + + // if DefMap[B] == C then C is the dominating definition at block B + // if DefMap[B] ~ undef then we haven't seen B yet + // if DefMap[B] == B then B is a join point of disjoint paths from X or B is + // an immediate successor of X (initial value). + using DefiningBlockMap = std::map; + DefiningBlockMap DefMap; + + // all blocks with pending visits + std::unordered_set PendingUpdates; + + DivergencePropagator(const FunctionRPOT &FuncRPOT, const DominatorTree &DT, + const PostDominatorTree &PDT, const LoopInfo &LI) + : FuncRPOT(FuncRPOT), DT(DT), PDT(PDT), LI(LI), + JoinBlocks(new ConstBlockSet) {} + + // set the definition at @block and mark @block as pending for a visit + void addPending(const BasicBlock &Block, const BasicBlock &DefBlock) { + bool WasAdded = DefMap.emplace(&Block, &DefBlock).second; + if (WasAdded) + PendingUpdates.insert(&Block); + } + + void printDefs(raw_ostream &Out) { + Out << "Propagator::DefMap {\n"; + for (const auto *Block : FuncRPOT) { + auto It = DefMap.find(Block); + Out << Block->getName() << " : "; + if (It == DefMap.end()) { + Out << "\n"; + } else { + const auto *DefBlock = It->second; + Out << (DefBlock ? DefBlock->getName() : "") << "\n"; + } + } + Out << "}\n"; + } + + // process @succBlock with reaching definition @defBlock + // the original divergent branch was in @parentLoop (if any) + void visitSuccessor(const BasicBlock &SuccBlock, const Loop *ParentLoop, + const BasicBlock &DefBlock) { + + // @succBlock is a loop exit + if (ParentLoop && !ParentLoop->contains(&SuccBlock)) { + DefMap.emplace(&SuccBlock, &DefBlock); + ReachedLoopExits.insert(&SuccBlock); + return; + } + + // first reaching def? + auto ItLastDef = DefMap.find(&SuccBlock); + if (ItLastDef == DefMap.end()) { + addPending(SuccBlock, DefBlock); + return; + } + + // a join of at least two definitions + if (ItLastDef->second != &DefBlock) { + // do we know this join already? + if (!JoinBlocks->insert(&SuccBlock).second) + return; + + // update the definition + addPending(SuccBlock, SuccBlock); + } + } + + // find all blocks reachable by two disjoint paths from @rootTerm. + // This method works for both divergent TerminatorInsts and loops with + // divergent exits. + // @rootBlock is either the block containing the branch or the header of the + // divergent loop. + // @nodeSuccessors is the set of successors of the node (Loop or Terminator) + // headed by @rootBlock. + // @parentLoop is the parent loop of the Loop or the loop that contains the + // Terminator. + template + std::unique_ptr + computeJoinPoints(const BasicBlock &RootBlock, + SuccessorIterable NodeSuccessors, const Loop *ParentLoop) { + assert(JoinBlocks); + + // immediate post dominator (no join block beyond that block) + const auto *PdNode = PDT.getNode(const_cast(&RootBlock)); + const auto *IpdNode = PdNode->getIDom(); + const auto *PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr; + + // bootstrap with branch targets + for (const auto *SuccBlock : NodeSuccessors) { + DefMap.emplace(SuccBlock, SuccBlock); + + if (ParentLoop && !ParentLoop->contains(SuccBlock)) { + // immediate loop exit from node. + ReachedLoopExits.insert(SuccBlock); + continue; + } else { + // regular successor + PendingUpdates.insert(SuccBlock); + } + } + + auto ItBeginRPO = FuncRPOT.begin(); + + // skip until term (TODO RPOT won't let us start at @term directly) + for (; *ItBeginRPO != &RootBlock; ++ItBeginRPO) { + } + + auto ItEndRPO = FuncRPOT.end(); + assert(ItBeginRPO != ItEndRPO); + + // propagate definitions at the immediate successors of the node in RPO + auto ItBlockRPO = ItBeginRPO; + while (++ItBlockRPO != ItEndRPO && *ItBlockRPO != PdBoundBlock) { + const auto *Block = *ItBlockRPO; + + // skip @block if not pending update + auto ItPending = PendingUpdates.find(Block); + if (ItPending == PendingUpdates.end()) + continue; + PendingUpdates.erase(ItPending); + + // propagate definition at @block to its successors + auto ItDef = DefMap.find(Block); + const auto *DefBlock = ItDef->second; + assert(DefBlock); + + auto *BlockLoop = LI.getLoopFor(Block); + if (ParentLoop && + (ParentLoop != BlockLoop && ParentLoop->contains(BlockLoop))) { + // if the successor is the header of a nested loop pretend its a + // single node with the loop's exits as successors + SmallVector BlockLoopExits; + BlockLoop->getExitBlocks(BlockLoopExits); + for (const auto *BlockLoopExit : BlockLoopExits) { + visitSuccessor(*BlockLoopExit, ParentLoop, *DefBlock); + } + + } else { + // the successors are either on the same loop level or loop exits + for (const auto *SuccBlock : successors(Block)) { + visitSuccessor(*SuccBlock, ParentLoop, *DefBlock); + } + } + } + + // We need to know the definition at the parent loop header to decide + // whether the definition at the header is different from the definition at + // the loop exits, which would indicate a divergent loop exits. + // + // A // loop header + // | + // B // nested loop header + // | + // C -> X (exit from B loop) -..-> (A latch) + // | + // D -> back to B (B latch) + // | + // proper exit from both loops + // + // D post-dominates B as it is the only proper exit from the "A loop". + // If C has a divergent branch, propagation will therefore stop at D. + // That implies that B will never receive a definition. + // But that definition can only be the same as at D (D itself in thise case) + // because all paths to anywhere have to pass through D. + // + const BasicBlock *ParentLoopHeader = + ParentLoop ? ParentLoop->getHeader() : nullptr; + if (ParentLoop && ParentLoop->contains(PdBoundBlock)) { + DefMap[ParentLoopHeader] = DefMap[PdBoundBlock]; + } + + // analyze reached loop exits + if (!ReachedLoopExits.empty()) { + assert(ParentLoop); + const auto *HeaderDefBlock = DefMap[ParentLoopHeader]; + LLVM_DEBUG(printDefs(dbgs())); + assert(HeaderDefBlock && "no definition in header of carrying loop"); + + for (const auto *ExitBlock : ReachedLoopExits) { + auto ItExitDef = DefMap.find(ExitBlock); + assert((ItExitDef != DefMap.end()) && + "no reaching def at reachable loop exit"); + if (ItExitDef->second != HeaderDefBlock) { + JoinBlocks->insert(ExitBlock); + } + } + } + + return std::move(JoinBlocks); + } +}; + +const ConstBlockSet &SyncDependenceAnalysis::join_blocks(const Loop &Loop) { + using LoopExitVec = SmallVector; + LoopExitVec LoopExits; + Loop.getExitBlocks(LoopExits); + if (LoopExits.size() < 1) { + return EmptyBlockSet; + } + + // already available in cache? + auto ItCached = CachedLoopExitJoins.find(&Loop); + if (ItCached != CachedLoopExitJoins.end()) + return *ItCached->second; + + // compute all join points + DivergencePropagator Propagator{FuncRPOT, DT, PDT, LI}; + auto JoinBlocks = Propagator.computeJoinPoints( + *Loop.getHeader(), LoopExits, Loop.getParentLoop()); + + auto ItInserted = CachedLoopExitJoins.emplace(&Loop, std::move(JoinBlocks)); + assert(ItInserted.second); + return *ItInserted.first->second; +} + +const ConstBlockSet & +SyncDependenceAnalysis::join_blocks(const TerminatorInst &Term) { + // trivial case + if (Term.getNumSuccessors() < 1) { + return EmptyBlockSet; + } + + // already available in cache? + auto ItCached = CachedBranchJoins.find(&Term); + if (ItCached != CachedBranchJoins.end()) + return *ItCached->second; + + // compute all join points + DivergencePropagator Propagator{FuncRPOT, DT, PDT, LI}; + const auto &TermBlock = *Term.getParent(); + auto JoinBlocks = Propagator.computeJoinPoints( + TermBlock, successors(Term.getParent()), LI.getLoopFor(&TermBlock)); + + auto ItInserted = CachedBranchJoins.emplace(&Term, std::move(JoinBlocks)); + assert(ItInserted.second); + return *ItInserted.first->second; +} + +} // namespace llvm Index: lib/CodeGen/SelectionDAG/SelectionDAG.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -984,7 +984,7 @@ void SelectionDAG::init(MachineFunction &NewMF, OptimizationRemarkEmitter &NewORE, Pass *PassPtr, const TargetLibraryInfo *LibraryInfo, - DivergenceAnalysis * Divergence) { + LegacyDivergenceAnalysis * Divergence) { MF = &NewMF; SDAGISelPass = PassPtr; ORE = &NewORE; Index: lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -417,7 +417,7 @@ SplitCriticalSideEffectEdges(const_cast(Fn), DT, LI); CurDAG->init(*MF, *ORE, this, LibInfo, - getAnalysisIfAvailable()); + getAnalysisIfAvailable()); FuncInfo->set(Fn, *MF, CurDAG); // Now get the optional analyzes if we want to. Index: lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp +++ lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp @@ -16,7 +16,7 @@ #include "AMDGPU.h" #include "AMDGPUIntrinsicInfo.h" #include "llvm/ADT/SetVector.h" -#include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/IR/IRBuilder.h" @@ -32,7 +32,7 @@ class AMDGPUAnnotateUniformValues : public FunctionPass, public InstVisitor { - DivergenceAnalysis *DA; + LegacyDivergenceAnalysis *DA; MemoryDependenceResults *MDR; LoopInfo *LI; DenseMap noClobberClones; @@ -49,7 +49,7 @@ return "AMDGPU Annotate Uniform Values"; } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); + AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.setPreservesAll(); @@ -64,7 +64,7 @@ INITIALIZE_PASS_BEGIN(AMDGPUAnnotateUniformValues, DEBUG_TYPE, "Add AMDGPU uniform metadata", false, false) -INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis) +INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE, @@ -176,7 +176,7 @@ if (skipFunction(F)) return false; - DA = &getAnalysis(); + DA = &getAnalysis(); MDR = &getAnalysis().getMemDep(); LI = &getAnalysis().getLoopInfo(); isKernelFunc = F.getCallingConv() == CallingConv::AMDGPU_KERNEL; Index: lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -18,7 +18,7 @@ #include "AMDGPUTargetMachine.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AssumptionCache.h" -#include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/Passes.h" @@ -60,7 +60,7 @@ public InstVisitor { const GCNSubtarget *ST = nullptr; AssumptionCache *AC = nullptr; - DivergenceAnalysis *DA = nullptr; + LegacyDivergenceAnalysis *DA = nullptr; Module *Mod = nullptr; bool HasUnsafeFPMath = false; AMDGPUAS AMDGPUASI; @@ -177,7 +177,7 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); - AU.addRequired(); + AU.addRequired(); AU.setPreservesAll(); } }; @@ -898,7 +898,7 @@ const AMDGPUTargetMachine &TM = TPC->getTM(); ST = &TM.getSubtarget(F); AC = &getAnalysis().getAssumptionCache(F); - DA = &getAnalysis(); + DA = &getAnalysis(); HasUnsafeFPMath = hasUnsafeFPMath(F); AMDGPUASI = TM.getAMDGPUAS(); @@ -918,7 +918,7 @@ INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis) +INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis) INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations", false, false) Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -29,7 +29,7 @@ #include "llvm/ADT/APInt.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" -#include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/ISDOpcodes.h" @@ -87,7 +87,7 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); - AU.addRequired(); + AU.addRequired(); SelectionDAGISel::getAnalysisUsage(AU); } @@ -253,7 +253,7 @@ "AMDGPU DAG->DAG Pattern Instruction Selection", false, false) INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo) INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysis) -INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis) +INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis) INITIALIZE_PASS_END(AMDGPUDAGToDAGISel, "isel", "AMDGPU DAG->DAG Pattern Instruction Selection", false, false) Index: lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp +++ lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp @@ -16,7 +16,6 @@ #include "AMDGPUSubtarget.h" #include "AMDGPUTargetMachine.h" #include "llvm/ADT/StringRef.h" -#include "llvm/Analysis/DivergenceAnalysis.h" #include "llvm/Analysis/Loads.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" Index: lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp +++ lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp @@ -25,7 +25,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" -#include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Transforms/Utils/Local.h" @@ -70,7 +70,7 @@ INITIALIZE_PASS_BEGIN(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE, "Unify divergent function exit nodes", false, false) INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis) +INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis) INITIALIZE_PASS_END(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE, "Unify divergent function exit nodes", false, false) @@ -78,10 +78,10 @@ // TODO: Preserve dominator tree. AU.addRequired(); - AU.addRequired(); + AU.addRequired(); // No divergent values are changed, only blocks and branch edges. - AU.addPreserved(); + AU.addPreserved(); // We preserve the non-critical-edgeness property AU.addPreservedID(BreakCriticalEdgesID); @@ -95,7 +95,7 @@ /// \returns true if \p BB is reachable through only uniform branches. /// XXX - Is there a more efficient way to find this? -static bool isUniformlyReached(const DivergenceAnalysis &DA, +static bool isUniformlyReached(const LegacyDivergenceAnalysis &DA, BasicBlock &BB) { SmallVector Stack; SmallPtrSet Visited; @@ -163,7 +163,7 @@ if (PDT.getRoots().size() <= 1) return false; - DivergenceAnalysis &DA = getAnalysis(); + LegacyDivergenceAnalysis &DA = getAnalysis(); // Loop over all of the blocks in a function, tracking all of the blocks that // return. Index: lib/Target/AMDGPU/SIAnnotateControlFlow.cpp =================================================================== --- lib/Target/AMDGPU/SIAnnotateControlFlow.cpp +++ lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -16,7 +16,7 @@ #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/BasicBlock.h" @@ -52,7 +52,7 @@ using StackVector = SmallVector; class SIAnnotateControlFlow : public FunctionPass { - DivergenceAnalysis *DA; + LegacyDivergenceAnalysis *DA; Type *Boolean; Type *Void; @@ -116,7 +116,7 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); - AU.addRequired(); + AU.addRequired(); AU.addPreserved(); FunctionPass::getAnalysisUsage(AU); } @@ -127,7 +127,7 @@ INITIALIZE_PASS_BEGIN(SIAnnotateControlFlow, DEBUG_TYPE, "Annotate SI Control Flow", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis) +INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis) INITIALIZE_PASS_END(SIAnnotateControlFlow, DEBUG_TYPE, "Annotate SI Control Flow", false, false) @@ -387,7 +387,7 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) { DT = &getAnalysis().getDomTree(); LI = &getAnalysis().getLoopInfo(); - DA = &getAnalysis(); + DA = &getAnalysis(); for (df_iterator I = df_begin(&F.getEntryBlock()), E = df_end(&F.getEntryBlock()); I != E; ++I) { Index: lib/Target/AMDGPU/SIISelLowering.h =================================================================== --- lib/Target/AMDGPU/SIISelLowering.h +++ lib/Target/AMDGPU/SIISelLowering.h @@ -339,7 +339,7 @@ unsigned Depth = 0) const override; bool isSDNodeSourceOfDivergence(const SDNode *N, - FunctionLoweringInfo *FLI, DivergenceAnalysis *DA) const override; + FunctionLoweringInfo *FLI, LegacyDivergenceAnalysis *DA) const override; bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth = 5) const; Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -9106,7 +9106,7 @@ } bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N, - FunctionLoweringInfo * FLI, DivergenceAnalysis * DA) const + FunctionLoweringInfo * FLI, LegacyDivergenceAnalysis * KDA) const { switch (N->getOpcode()) { case ISD::Register: @@ -9139,7 +9139,7 @@ else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv())) return true; } - return !DA || DA->isDivergent(FLI->getValueFromVirtualReg(Reg)); + return !KDA || KDA->isDivergent(FLI->getValueFromVirtualReg(Reg)); } } break; Index: lib/Transforms/Scalar/LoopUnswitch.cpp =================================================================== --- lib/Transforms/Scalar/LoopUnswitch.cpp +++ lib/Transforms/Scalar/LoopUnswitch.cpp @@ -33,7 +33,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" -#include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" @@ -215,7 +215,7 @@ AU.addRequired(); AU.addRequired(); if (hasBranchDivergence) - AU.addRequired(); + AU.addRequired(); getLoopAnalysisUsage(AU); } @@ -383,7 +383,7 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(LoopPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis) +INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis) INITIALIZE_PASS_END(LoopUnswitch, "loop-unswitch", "Unswitch loops", false, false) @@ -864,7 +864,7 @@ return false; } if (hasBranchDivergence && - getAnalysis().isDivergent(LoopCond)) { + getAnalysis().isDivergent(LoopCond)) { LLVM_DEBUG(dbgs() << "NOT unswitching loop %" << currentLoop->getHeader()->getName() << " at non-trivial condition '" << *Val Index: lib/Transforms/Scalar/StructurizeCFG.cpp =================================================================== --- lib/Transforms/Scalar/StructurizeCFG.cpp +++ lib/Transforms/Scalar/StructurizeCFG.cpp @@ -13,7 +13,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/RegionInfo.h" #include "llvm/Analysis/RegionIterator.h" @@ -183,7 +183,7 @@ Function *Func; Region *ParentRegion; - DivergenceAnalysis *DA; + LegacyDivergenceAnalysis *DA; DominatorTree *DT; LoopInfo *LI; @@ -269,7 +269,7 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { if (SkipUniformRegions) - AU.addRequired(); + AU.addRequired(); AU.addRequiredID(LowerSwitchID); AU.addRequired(); AU.addRequired(); @@ -285,7 +285,7 @@ INITIALIZE_PASS_BEGIN(StructurizeCFG, "structurizecfg", "Structurize the CFG", false, false) -INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis) +INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis) INITIALIZE_PASS_DEPENDENCY(LowerSwitch) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(RegionInfoPass) @@ -914,7 +914,7 @@ } static bool hasOnlyUniformBranches(Region *R, unsigned UniformMDKindID, - const DivergenceAnalysis &DA) { + const LegacyDivergenceAnalysis &DA) { for (auto E : R->elements()) { if (!E->isSubRegion()) { auto Br = dyn_cast(E->getEntry()->getTerminator()); @@ -962,7 +962,7 @@ // but we shouldn't rely on metadata for correctness! unsigned UniformMDKindID = R->getEntry()->getContext().getMDKindID("structurizecfg.uniform"); - DA = &getAnalysis(); + DA = &getAnalysis(); if (hasOnlyUniformBranches(R, UniformMDKindID, *DA)) { LLVM_DEBUG(dbgs() << "Skipping region with uniform control flow: " << *R Index: test/Analysis/DivergenceAnalysis/AMDGPU/always_uniform.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/always_uniform.ll @@ -0,0 +1,14 @@ +; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +define amdgpu_kernel void @workitem_id_x() #1 { + %id.x = call i32 @llvm.amdgcn.workitem.id.x() +; CHECK: DIVERGENT: %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %first.lane = call i32 @llvm.amdgcn.readfirstlane(i32 %id.x) +; CHECK-NOT: DIVERGENT: %first.lane = call i32 @llvm.amdgcn.readfirstlane(i32 %id.x) + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 +declare i32 @llvm.amdgcn.readfirstlane(i32) #0 + +attributes #0 = { nounwind readnone } Index: test/Analysis/DivergenceAnalysis/AMDGPU/atomics.ll =================================================================== --- test/Analysis/DivergenceAnalysis/AMDGPU/atomics.ll +++ test/Analysis/DivergenceAnalysis/AMDGPU/atomics.ll @@ -1,4 +1,4 @@ -; RUN: opt -mtriple=amdgcn-- -analyze -divergence %s | FileCheck %s +; RUN: opt -mtriple=amdgcn-- -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s ; CHECK: DIVERGENT: %orig = atomicrmw xchg i32* %ptr, i32 %val seq_cst define i32 @test1(i32* %ptr, i32 %val) #0 { Index: test/Analysis/DivergenceAnalysis/AMDGPU/hidden_diverge.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/hidden_diverge.ll @@ -0,0 +1,26 @@ +; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +define amdgpu_kernel void @hidden_diverge(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'hidden_diverge' +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %cond.var = icmp slt i32 %tid, 0 + br i1 %cond.var, label %B, label %C ; divergent +; CHECK: DIVERGENT: br i1 %cond.var, +B: + %cond.uni = icmp slt i32 %n, 0 + br i1 %cond.uni, label %C, label %merge ; uniform +; CHECK-NOT: DIVERGENT: br i1 %cond.uni, +C: + %phi.var.hidden = phi i32 [ 1, %entry ], [ 2, %B ] +; CHECK: DIVERGENT: %phi.var.hidden = phi i32 + br label %merge +merge: + %phi.ipd = phi i32 [ %a, %B ], [ %b, %C ] +; CHECK: DIVERGENT: %phi.ipd = phi i32 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 + +attributes #0 = { nounwind readnone } Index: test/Analysis/DivergenceAnalysis/AMDGPU/hidden_loopdiverge.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/hidden_loopdiverge.ll @@ -0,0 +1,223 @@ +; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +; divergent loop (H
, B) +; the divergent join point in %exit is obscured by uniform control joining in %X +define amdgpu_kernel void @hidden_loop_diverge(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'hidden_loop_diverge': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + br i1 %uni.cond, label %X, label %H ; uniform + +H: + %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %B ] + %div.exitx = icmp slt i32 %tid, 0 + br i1 %div.exitx, label %X, label %B ; divergent branch +; CHECK: DIVERGENT: %div.exitx = +; CHECK: DIVERGENT: br i1 %div.exitx, + +B: + %uni.inc = add i32 %uni.merge.h, 1 + %div.exity = icmp sgt i32 %tid, 0 + br i1 %div.exity, label %Y, label %H ; divergent branch +; CHECK: DIVERGENT: %div.exity = +; CHECK: DIVERGENT: br i1 %div.exity, + +X: + %div.merge.x = phi i32 [ %a, %entry ], [ %uni.merge.h, %H ] ; temporal divergent phi + br i1 %uni.cond, label %Y, label %exit +; CHECK: DIVERGENT: %div.merge.x = + +Y: + %div.merge.y = phi i32 [ 42, %X ], [ %b, %B ] + br label %exit +; CHECK: DIVERGENT: %div.merge.y = + +exit: + %div.merge.exit = phi i32 [ %a, %X ], [ %b, %Y ] + ret void +; CHECK: DIVERGENT: %div.merge.exit = +} + +; divergent loop (H
, B) +; the phi nodes in X and Y don't actually receive divergent values +define amdgpu_kernel void @unobserved_loop_diverge(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'unobserved_loop_diverge': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + br i1 %uni.cond, label %X, label %H ; uniform + +H: + %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %B ] + %div.exitx = icmp slt i32 %tid, 0 + br i1 %div.exitx, label %X, label %B ; divergent branch +; CHECK: DIVERGENT: %div.exitx = +; CHECK: DIVERGENT: br i1 %div.exitx, + +B: + %uni.inc = add i32 %uni.merge.h, 1 + %div.exity = icmp sgt i32 %tid, 0 + br i1 %div.exity, label %Y, label %H ; divergent branch +; CHECK: DIVERGENT: %div.exity = +; CHECK: DIVERGENT: br i1 %div.exity, + +X: + %uni.merge.x = phi i32 [ %a, %entry ], [ %b, %H ] + br label %exit + +Y: + %uni.merge.y = phi i32 [ %b, %B ] + br label %exit + +exit: + %div.merge.exit = phi i32 [ %a, %X ], [ %b, %Y ] + ret void +; CHECK: DIVERGENT: %div.merge.exit = +} + +; divergent loop (G
, L) inside divergent loop (H
, B, C, D, G, L) +; the inner loop has no exit to top level. +; the outer loop becomes divergent as its exiting branch in C is control-dependent on the inner loop's divergent loop exit in D. +define amdgpu_kernel void @hidden_nestedloop_diverge(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'hidden_nestedloop_diverge': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + %div.exitx = icmp slt i32 %tid, 0 + br i1 %uni.cond, label %X, label %H + +H: + %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %D ] + br i1 %uni.cond, label %G, label %B +; CHECK: DIVERGENT: %div.exitx = + +B: + br i1 %uni.cond, label %X, label %C + +C: + br i1 %uni.cond, label %Y, label %D + +D: + %uni.inc = add i32 %uni.merge.h, 1 + br label %H + +G: + br i1 %div.exitx, label %C, label %L +; CHECK: DIVERGENT: br i1 %div.exitx, + +L: + br i1 %uni.cond, label %D, label %G + +X: + %div.merge.x = phi i32 [ %a, %entry ], [ %uni.merge.h, %B ] ; temporal divergent phi + br i1 %uni.cond, label %Y, label %exit +; CHECK: DIVERGENT: %div.merge.x = + +Y: + %div.merge.y = phi i32 [ 42, %X ], [ %b, %C ] + br label %exit +; CHECK: DIVERGENT: %div.merge.y = + +exit: + %div.merge.exit = phi i32 [ %a, %X ], [ %b, %Y ] + ret void +; CHECK: DIVERGENT: %div.merge.exit = +} + +; divergent loop (G
, L) in divergent loop (H
, B, C, G, L) +; the outer loop has no immediately divergent exiting edge. +; the inner exiting edge is exiting to top-level through the outer loop causing both to become divergent. +define amdgpu_kernel void @hidden_doublebreak_diverge(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'hidden_doublebreak_diverge': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + %div.exitx = icmp slt i32 %tid, 0 + br i1 %uni.cond, label %X, label %H + +H: + %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %C ] + br i1 %uni.cond, label %G, label %B +; CHECK: DIVERGENT: %div.exitx = + +B: + br i1 %uni.cond, label %Y, label %C + +C: + %uni.inc = add i32 %uni.merge.h, 1 + br label %H + +G: + br i1 %div.exitx, label %X, label %L ; two-level break +; CHECK: DIVERGENT: br i1 %div.exitx, + +L: + br i1 %uni.cond, label %C, label %G + +X: + %div.merge.x = phi i32 [ %a, %entry ], [ %uni.merge.h, %G ] ; temporal divergence + br label %Y +; CHECK: DIVERGENT: %div.merge.x = + +Y: + %div.merge.y = phi i32 [ 42, %X ], [ %b, %B ] + ret void +; CHECK: DIVERGENT: %div.merge.y = +} + +; divergent loop (G
, L) contained inside a uniform loop (H
, B, G, L , D) +define amdgpu_kernel void @hidden_containedloop_diverge(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'hidden_containedloop_diverge': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + %div.exitx = icmp slt i32 %tid, 0 + br i1 %uni.cond, label %X, label %H + +H: + %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc.d, %D ] + br i1 %uni.cond, label %G, label %B +; CHECK: DIVERGENT: %div.exitx = + +B: + %div.merge.b = phi i32 [ 42, %H ], [ %uni.merge.g, %G ] + br label %D +; CHECK: DIVERGENT: %div.merge.b = + +G: + %uni.merge.g = phi i32 [ 123, %H ], [ %uni.inc.l, %L ] + br i1 %div.exitx, label %B, label %L +; CHECK: DIVERGENT: br i1 %div.exitx, + +L: + %uni.inc.l = add i32 %uni.merge.g, 1 + br i1 %uni.cond, label %G, label %D + +D: + %uni.inc.d = add i32 %uni.merge.h, 1 + br i1 %uni.cond, label %X, label %H + +X: + %uni.merge.x = phi i32 [ %a, %entry ], [ %uni.inc.d, %D ] + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 + +attributes #0 = { nounwind readnone } Index: test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll =================================================================== --- test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll +++ test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll @@ -1,4 +1,4 @@ -; RUN: opt -mtriple=amdgcn-- -analyze -divergence %s | FileCheck %s +; RUN: opt -mtriple=amdgcn-- -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s ; CHECK: DIVERGENT: %swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0 define amdgpu_kernel void @ds_swizzle(i32 addrspace(1)* %out, i32 %src) #0 { Index: test/Analysis/DivergenceAnalysis/AMDGPU/irreducible.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/irreducible.ll @@ -0,0 +1,48 @@ +; RUN: opt -mtriple=amdgcn-- -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +; This test contains an unstructured loop. +; +-------------- entry ----------------+ +; | | +; V V +; i1 = phi(0, i3) i2 = phi(0, i3) +; j1 = i1 + 1 ---> i3 = phi(j1, j2) <--- j2 = i2 + 2 +; ^ | ^ +; | V | +; +-------- switch (tid / i3) ----------+ +; | +; V +; if (i3 == 5) // divergent +; because sync dependent on (tid / i3). +define i32 @unstructured_loop(i1 %entry_cond) { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'unstructured_loop' +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + br i1 %entry_cond, label %loop_entry_1, label %loop_entry_2 +loop_entry_1: + %i1 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ] + %j1 = add i32 %i1, 1 + br label %loop_body +loop_entry_2: + %i2 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ] + %j2 = add i32 %i2, 2 + br label %loop_body +loop_body: + %i3 = phi i32 [ %j1, %loop_entry_1 ], [ %j2, %loop_entry_2 ] + br label %loop_latch +loop_latch: + %div = sdiv i32 %tid, %i3 + switch i32 %div, label %branch [ i32 1, label %loop_entry_1 + i32 2, label %loop_entry_2 ] +branch: + %cmp = icmp eq i32 %i3, 5 + br i1 %cmp, label %then, label %else +; CHECK: DIVERGENT: br i1 %cmp, +then: + ret i32 0 +else: + ret i32 1 +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 + +attributes #0 = { nounwind readnone } Index: test/Analysis/DivergenceAnalysis/AMDGPU/kernel-args.ll =================================================================== --- test/Analysis/DivergenceAnalysis/AMDGPU/kernel-args.ll +++ test/Analysis/DivergenceAnalysis/AMDGPU/kernel-args.ll @@ -1,6 +1,6 @@ -; RUN: opt %s -mtriple amdgcn-- -analyze -divergence | FileCheck %s +; RUN: opt %s -mtriple amdgcn-- -analyze -divergence -use-gpu-divergence-analysis | FileCheck %s -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'test_amdgpu_ps': +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'test_amdgpu_ps': ; CHECK: DIVERGENT: ; CHECK-NOT: %arg0 ; CHECK-NOT: %arg1 @@ -14,7 +14,7 @@ ret void } -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'test_amdgpu_kernel': +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'test_amdgpu_kernel': ; CHECK-NOT: %arg0 ; CHECK-NOT: %arg1 ; CHECK-NOT: %arg2 @@ -26,7 +26,7 @@ ret void } -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'test_c': +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'test_c': ; CHECK: DIVERGENT: ; CHECK: DIVERGENT: ; CHECK: DIVERGENT: Index: test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll =================================================================== --- test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll +++ test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll @@ -1,4 +1,4 @@ -;RUN: opt -mtriple=amdgcn-mesa-mesa3d -analyze -divergence %s | FileCheck %s +;RUN: opt -mtriple=amdgcn-mesa-mesa3d -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.swap( define float @buffer_atomic_swap(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { Index: test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.image.atomic.ll =================================================================== --- test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.image.atomic.ll +++ test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.image.atomic.ll @@ -1,4 +1,4 @@ -;RUN: opt -mtriple=amdgcn-mesa-mesa3d -analyze -divergence %s | FileCheck %s +;RUN: opt -mtriple=amdgcn-mesa-mesa3d -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32( define float @image_atomic_swap(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { Index: test/Analysis/DivergenceAnalysis/AMDGPU/loads.ll =================================================================== --- test/Analysis/DivergenceAnalysis/AMDGPU/loads.ll +++ test/Analysis/DivergenceAnalysis/AMDGPU/loads.ll @@ -1,4 +1,4 @@ -; RUN: opt -mtriple=amdgcn-- -analyze -divergence %s | FileCheck %s +; RUN: opt -mtriple=amdgcn-- -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s ; Test that we consider loads from flat and private addrspaces to be divergent. Index: test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll =================================================================== --- test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll +++ test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll @@ -1,4 +1,4 @@ -; RUN: opt %s -mtriple amdgcn-- -analyze -divergence | FileCheck %s +; RUN: opt %s -mtriple amdgcn-- -analyze -divergence -use-gpu-divergence-analysis | FileCheck %s ; CHECK: DIVERGENT: %tmp5 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %tmp2 ; CHECK: DIVERGENT: %tmp10 = load volatile float, float addrspace(1)* %tmp5, align 4 Index: test/Analysis/DivergenceAnalysis/AMDGPU/phi-undef.ll =================================================================== --- test/Analysis/DivergenceAnalysis/AMDGPU/phi-undef.ll +++ test/Analysis/DivergenceAnalysis/AMDGPU/phi-undef.ll @@ -1,4 +1,4 @@ -; RUN: opt -mtriple=amdgcn-- -analyze -divergence %s | FileCheck %s +; RUN: opt -mtriple=amdgcn-- -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s ; CHECK-LABEL: 'test1': ; CHECK-NEXT: DIVERGENT: i32 %bound Index: test/Analysis/DivergenceAnalysis/AMDGPU/temporal_diverge.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/temporal_diverge.ll @@ -0,0 +1,154 @@ +; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +; temporal-divergent use of value carried by divergent loop +define amdgpu_kernel void @temporal_diverge(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'temporal_diverge': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + br label %H + +H: + %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %H ] + %uni.inc = add i32 %uni.merge.h, 1 + %div.exitx = icmp slt i32 %tid, 0 + br i1 %div.exitx, label %X, label %H ; divergent branch +; CHECK: DIVERGENT: %div.exitx = +; CHECK: DIVERGENT: br i1 %div.exitx, + +X: + %div.user = add i32 %uni.inc, 5 + ret void +} + +; temporal-divergent use of value carried by divergent loop inside a top-level loop +define amdgpu_kernel void @temporal_diverge_inloop(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'temporal_diverge_inloop': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + br label %G + +G: + br label %H + +H: + %uni.merge.h = phi i32 [ 0, %G ], [ %uni.inc, %H ] + %uni.inc = add i32 %uni.merge.h, 1 + %div.exitx = icmp slt i32 %tid, 0 + br i1 %div.exitx, label %X, label %H ; divergent branch +; CHECK: DIVERGENT: %div.exitx = +; CHECK: DIVERGENT: br i1 %div.exitx, + +X: + %div.user = add i32 %uni.inc, 5 + br i1 %uni.cond, label %G, label %Y + +Y: + %div.alsouser = add i32 %uni.inc, 5 + ret void +} + + +; temporal-uniform use of a valud, definition and users are carried by a surrounding divergent loop +define amdgpu_kernel void @temporal_uniform_indivloop(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'temporal_uniform_indivloop': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + br label %G + +G: + br label %H + +H: + %uni.merge.h = phi i32 [ 0, %G ], [ %uni.inc, %H ] + %uni.inc = add i32 %uni.merge.h, 1 + br i1 %uni.cond, label %X, label %H ; divergent branch + +X: + %uni.user = add i32 %uni.inc, 5 + %div.exity = icmp slt i32 %tid, 0 +; CHECK: DIVERGENT: %div.exity = + br i1 %div.exity, label %G, label %Y +; CHECK: DIVERGENT: br i1 %div.exity, + +Y: + %div.alsouser = add i32 %uni.inc, 5 + ret void +} + + +; temporal-divergent use of value carried by divergent loop, user is inside sibling loop +define amdgpu_kernel void @temporal_diverge_loopuser(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'temporal_diverge_loopuser': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + br label %H + +H: + %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %H ] + %uni.inc = add i32 %uni.merge.h, 1 + %div.exitx = icmp slt i32 %tid, 0 + br i1 %div.exitx, label %X, label %H ; divergent branch +; CHECK: DIVERGENT: %div.exitx = +; CHECK: DIVERGENT: br i1 %div.exitx, + +X: + br label %G + +G: + %div.user = add i32 %uni.inc, 5 + br i1 %uni.cond, label %G, label %Y + +Y: + ret void +} + +; temporal-divergent use of value carried by divergent loop, user is inside sibling loop, defs and use are carried by a uniform loop +define amdgpu_kernel void @temporal_diverge_loopuser_nested(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'temporal_diverge_loopuser_nested': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + br label %H + +H: + %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %H ] + %uni.inc = add i32 %uni.merge.h, 1 + %div.exitx = icmp slt i32 %tid, 0 + br i1 %div.exitx, label %X, label %H ; divergent branch +; CHECK: DIVERGENT: %div.exitx = +; CHECK: DIVERGENT: br i1 %div.exitx, + +X: + br label %G + +G: + %div.user = add i32 %uni.inc, 5 + br i1 %uni.cond, label %G, label %Y + +Y: + ret void +} + + +declare i32 @llvm.amdgcn.workitem.id.x() #0 + +attributes #0 = { nounwind readnone } Index: test/Analysis/DivergenceAnalysis/AMDGPU/unreachable-loop-block.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/unreachable-loop-block.ll @@ -1,17 +0,0 @@ -; RUN: opt %s -mtriple amdgcn-- -analyze -divergence | FileCheck %s - -; CHECK: DIVERGENT: %tmp = cmpxchg volatile -define amdgpu_kernel void @unreachable_loop(i32 %tidx) #0 { -entry: - unreachable - -unreachable_loop: ; preds = %do.body.i, %if.then11 - %tmp = cmpxchg volatile i32 addrspace(1)* null, i32 0, i32 0 seq_cst seq_cst - %cmp.i = extractvalue { i32, i1 } %tmp, 1 - br i1 %cmp.i, label %unreachable_loop, label %end - -end: ; preds = %do.body.i51, %atomicAdd_g_f.exit - unreachable -} - -attributes #0 = { norecurse nounwind } Index: test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll =================================================================== --- test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll +++ test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll @@ -1,4 +1,4 @@ -; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence %s | FileCheck %s +; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s declare i32 @llvm.amdgcn.workitem.id.x() #0 declare i32 @llvm.amdgcn.workitem.id.y() #0 Index: test/Analysis/DivergenceAnalysis/Loops/IndirectUniAccess.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/Loops/IndirectUniAccess.ll @@ -0,0 +1,74 @@ +; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s + +; CHECK: Divergence of loop for.body { +; CHECK-NEXT: DIVERGENT: %indvars.iv29 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next30, %for.cond.cleanup3 ] +; CHECK-NEXT: DIVERGENT: %x.0.lcssa = phi double [ 0.000000e+00, %for.body ], [ %add, %for.body4 ] +; CHECK-NEXT: DIVERGENT: %arrayidx10 = getelementptr inbounds double, double* %C, i64 %indvars.iv29 +; CHECK-NEXT: DIVERGENT: store double %x.0.lcssa, double* %arrayidx10, align 8 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next30 = add nuw nsw i64 %indvars.iv29, 1 +; CHECK-NEXT: DIVERGENT: %x.025 = phi double [ %add, %for.body4 ], [ 0.000000e+00, %for.body4.preheader ] +; CHECK-NEXT: DIVERGENT: %arrayidx8 = getelementptr inbounds double, double* %1, i64 %indvars.iv29 +; CHECK-NEXT: DIVERGENT: %2 = load double, double* %arrayidx8, align 8 +; CHECK-NEXT: DIVERGENT: %add = fadd double %x.025, %2 +; CHECK-NEXT: } +; CHECK: Divergence of loop for.body4 { +; CHECK-NEXT: DIVERGENT: %indvars.iv = phi i64 [ %indvars.iv.next, %for.body4 ], [ 0, %for.body4.preheader ] +; CHECK-NEXT: DIVERGENT: %x.025 = phi double [ %add, %for.body4 ], [ 0.000000e+00, %for.body4.preheader ] +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds i32, i32* %Index, i64 %indvars.iv +; CHECK-NEXT: DIVERGENT: %0 = load i32, i32* %arrayidx, align 4 +; CHECK-NEXT: DIVERGENT: %idxprom5 = sext i32 %0 to i64 +; CHECK-NEXT: DIVERGENT: %arrayidx6 = getelementptr inbounds double*, double** %A, i64 %idxprom5 +; CHECK-NEXT: DIVERGENT: %1 = load double*, double** %arrayidx6, align 8 +; CHECK-NEXT: DIVERGENT: %arrayidx8 = getelementptr inbounds double, double* %1, i64 %indvars.iv29 +; CHECK-NEXT: DIVERGENT: %2 = load double, double* %arrayidx8, align 8 +; CHECK-NEXT: DIVERGENT: %add = fadd double %x.025, %2 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK-NEXT: } + +; Function Attrs: norecurse nounwind uwtable +define void @test(i32* nocapture readonly %Index, double** nocapture readonly %A, double* nocapture %C, i32 %m, i32 %n) #0 { +entry: + %cmp27 = icmp sgt i32 %n, 0 + br i1 %cmp27, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %cmp224 = icmp sgt i32 %m, 0 + %wide.trip.count = zext i32 %m to i64 + %wide.trip.count31 = zext i32 %n to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.cond.cleanup3, %entry + ret void + +for.body: ; preds = %for.cond.cleanup3, %for.body.lr.ph + %indvars.iv29 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next30, %for.cond.cleanup3 ] + br i1 %cmp224, label %for.body4.preheader, label %for.cond.cleanup3 + +for.body4.preheader: ; preds = %for.body + br label %for.body4 + +for.cond.cleanup3: ; preds = %for.body4, %for.body + %x.0.lcssa = phi double [ 0.000000e+00, %for.body ], [ %add, %for.body4 ] + %arrayidx10 = getelementptr inbounds double, double* %C, i64 %indvars.iv29 + store double %x.0.lcssa, double* %arrayidx10, align 8 + %indvars.iv.next30 = add nuw nsw i64 %indvars.iv29, 1 + %exitcond32 = icmp eq i64 %indvars.iv.next30, %wide.trip.count31 + br i1 %exitcond32, label %for.cond.cleanup, label %for.body + +for.body4: ; preds = %for.body4.preheader, %for.body4 + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body4 ], [ 0, %for.body4.preheader ] + %x.025 = phi double [ %add, %for.body4 ], [ 0.000000e+00, %for.body4.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %Index, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %idxprom5 = sext i32 %0 to i64 + %arrayidx6 = getelementptr inbounds double*, double** %A, i64 %idxprom5 + %1 = load double*, double** %arrayidx6, align 8 + %arrayidx8 = getelementptr inbounds double, double* %1, i64 %indvars.iv29 + %2 = load double, double* %arrayidx8, align 8 + %add = fadd double %x.025, %2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup3, label %for.body4 +} + +attributes #0 = { norecurse nounwind } Index: test/Analysis/DivergenceAnalysis/Loops/LoopWithDivBranch.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/Loops/LoopWithDivBranch.ll @@ -0,0 +1,44 @@ +; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s + +; CHECK: Divergence of loop for.body { +; CHECK-NEXT: DIVERGENT: %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %B ] +; CHECK-NEXT: DIVERGENT: %hfreq = srem i64 %indvars.iv, 2 +; CHECK-NEXT: DIVERGENT: %toggle = trunc i64 %hfreq to i1 +; CHECK-NEXT: DIVERGENT: br i1 %toggle, label %A, label %B +; CHECK-NEXT: DIVERGENT: %divphi = phi float [ %cast, %A ], [ 4.200000e+01, %for.body ] +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds float, float* %ptr, i64 %indvars.iv +; CHECK-NEXT: DIVERGENT: store float %divphi, float* %arrayidx, align 4 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK-NEXT: } +define void @test1(float* nocapture %ptr, i64 %n) #0 { + entry: + %cmp = icmp sgt i64 %n, 0 + br i1 %cmp, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body, %for.body.lr.ph + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %B ] + %hfreq = srem i64 %indvars.iv, 2 + %toggle = trunc i64 %hfreq to i1 + br i1 %toggle, label %A, label %B + +A: + %trunc = trunc i64 %n to i32 + %cast = sitofp i32 %trunc to float + br label %B + +B: + %divphi = phi float [ %cast, %A ], [ 4.200000e+01, %for.body ] + %arrayidx = getelementptr inbounds float, float* %ptr, i64 %indvars.iv + store float %divphi, float* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +attributes #0 = { nounwind } Index: test/Analysis/DivergenceAnalysis/Loops/LoopWithDivLoop.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/Loops/LoopWithDivLoop.ll @@ -0,0 +1,60 @@ +; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s + +; CHECK: Printing analysis 'Loop Divergence Printer' for function 'test1': +; CHECK-NEXT: Divergence of loop for.body { +; CHECK-NEXT: DIVERGENT: %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.latch ] +; CHECK-NEXT: DIVERGENT: %row = mul i64 %n, %indvars.iv +; CHECK-NEXT: DIVERGENT: %idx = add i64 %row, %indvars.iv2 +; CHECK-NEXT: DIVERGENT: %trunc = trunc i64 %idx to i32 +; CHECK-NEXT: DIVERGENT: %val = sitofp i32 %trunc to float +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds float, float* %ptr, i64 %idx +; CHECK-NEXT: DIVERGENT: store float %val, float* %arrayidx, align 4 +; CHECK-NEXT: DIVERGENT: %exitcond2 = icmp sge i64 %indvars.iv.next2, %indvars.iv +; CHECK-NEXT: DIVERGENT: br i1 %exitcond2, label %for.latch, label %for.body2 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK-NEXT: } +; CHECK: Divergence of loop for.body2 { +; CHECK-NEXT: DIVERGENT: %indvars.iv2 = phi i64 [ 0, %for.body ], [ %indvars.iv.next2, %for.body2 ] +; CHECK-NEXT: DIVERGENT: %idx = add i64 %row, %indvars.iv2 +; CHECK-NEXT: DIVERGENT: %trunc = trunc i64 %idx to i32 +; CHECK-NEXT: DIVERGENT: %val = sitofp i32 %trunc to float +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds float, float* %ptr, i64 %idx +; CHECK-NEXT: DIVERGENT: store float %val, float* %arrayidx, align 4 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next2 = add nuw nsw i64 %indvars.iv2, 1 +; CHECK-NEXT: } + +define void @test1(float* nocapture %ptr, i64 %n) #0 { + entry: + %cmp = icmp sgt i64 %n, 0 + br i1 %cmp, label %for.body.lr.ph, label %exit + +for.body.lr.ph: ; preds = %entry + br label %for.body + +exit: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.latch ] + br label %for.body2 + +for.body2: + %indvars.iv2 = phi i64 [ 0, %for.body ], [ %indvars.iv.next2, %for.body2 ] + %row = mul i64 %n, %indvars.iv + %idx = add i64 %row, %indvars.iv2 + %trunc = trunc i64 %idx to i32 + %val = sitofp i32 %trunc to float + %arrayidx = getelementptr inbounds float, float* %ptr, i64 %idx + store float %val, float* %arrayidx, align 4 + %indvars.iv.next2 = add nuw nsw i64 %indvars.iv2, 1 + %exitcond2 = icmp sge i64 %indvars.iv.next2, %indvars.iv + br i1 %exitcond2, label %for.latch, label %for.body2 + +for.latch: + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond, label %exit, label %for.body +} + +attributes #0 = { nounwind } + Index: test/Analysis/DivergenceAnalysis/Loops/LoopWithLI.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/Loops/LoopWithLI.ll @@ -0,0 +1,31 @@ +; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s + +; CHECK: Divergence of loop for.body { +; CHECK-NEXT: DIVERGENT: %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv +; CHECK-NEXT: DIVERGENT: store float %cast, float* %arrayidx, align 4 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK-NEXT: } +define void @test1(float* nocapture %A, i64 %n) #0 { + entry: + %cmp = icmp sgt i64 %n, 0 + br i1 %cmp, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body, %for.body.lr.ph + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv + %trunc = trunc i64 %n to i32 + %cast = sitofp i32 %trunc to float + store float %cast, float* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +attributes #0 = { nounwind } Index: test/Analysis/DivergenceAnalysis/Loops/LoopWithUniBranch.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/Loops/LoopWithUniBranch.ll @@ -0,0 +1,39 @@ +; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s + +; CHECK: Divergence of loop for.body { +; CHECK-NEXT: DIVERGENT: %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %B ] +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds float, float* %ptr, i64 %indvars.iv +; CHECK-NEXT: DIVERGENT: store float %divphi, float* %arrayidx, align 4 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK-NEXT: } +define void @test1(float* nocapture %ptr, i64 %n) #0 { + entry: + %cmp = icmp sgt i64 %n, 0 + br i1 %cmp, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body, %for.body.lr.ph + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %B ] + %invar = trunc i64 %n to i1 + br i1 %invar, label %A, label %B + +A: + %trunc = trunc i64 %n to i32 + %cast = sitofp i32 %trunc to float + br label %B + +B: + %divphi = phi float [ %cast, %A ], [ 4.200000e+01, %for.body ] + %arrayidx = getelementptr inbounds float, float* %ptr, i64 %indvars.iv + store float %divphi, float* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +attributes #0 = { nounwind } Index: test/Analysis/DivergenceAnalysis/Loops/LoopWithUniLoop.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/Loops/LoopWithUniLoop.ll @@ -0,0 +1,55 @@ +; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s + +; CHECK: Divergence of loop for.body { +; CHECK-NEXT: DIVERGENT: %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.latch ] +; CHECK-NEXT: DIVERGENT: %row = mul i64 %n, %indvars.iv +; CHECK-NEXT: DIVERGENT: %idx = add i64 %row, %indvars.iv2 +; CHECK-NEXT: DIVERGENT: %trunc = trunc i64 %idx to i32 +; CHECK-NEXT: DIVERGENT: %val = sitofp i32 %trunc to float +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds float, float* %ptr, i64 %idx +; CHECK-NEXT: DIVERGENT: store float %val, float* %arrayidx, align 4 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK-NEXT: } +; CHECK: Divergence of loop for.body2 { +; CHECK-NEXT: DIVERGENT: %indvars.iv2 = phi i64 [ 0, %for.body ], [ %indvars.iv.next2, %for.body2 ] +; CHECK-NEXT: DIVERGENT: %idx = add i64 %row, %indvars.iv2 +; CHECK-NEXT: DIVERGENT: %trunc = trunc i64 %idx to i32 +; CHECK-NEXT: DIVERGENT: %val = sitofp i32 %trunc to float +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds float, float* %ptr, i64 %idx +; CHECK-NEXT: DIVERGENT: store float %val, float* %arrayidx, align 4 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next2 = add nuw nsw i64 %indvars.iv2, 1 +; CHECK-NEXT: } +define void @test1(float* nocapture %ptr, i64 %n) #0 { + entry: + %cmp = icmp sgt i64 %n, 0 + br i1 %cmp, label %for.body.lr.ph, label %exit + +for.body.lr.ph: ; preds = %entry + br label %for.body + +exit: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.latch ] + br label %for.body2 + +for.body2: + %indvars.iv2 = phi i64 [ 0, %for.body ], [ %indvars.iv.next2, %for.body2 ] + %row = mul i64 %n, %indvars.iv + %idx = add i64 %row, %indvars.iv2 + %trunc = trunc i64 %idx to i32 + %val = sitofp i32 %trunc to float + %arrayidx = getelementptr inbounds float, float* %ptr, i64 %idx + store float %val, float* %arrayidx, align 4 + %indvars.iv.next2 = add nuw nsw i64 %indvars.iv2, 1 + %exitcond2 = icmp eq i64 %indvars.iv.next2, %n + br i1 %exitcond2, label %for.latch, label %for.body2 + +for.latch: + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond, label %exit, label %for.body +} + +attributes #0 = { nounwind } Index: test/Analysis/DivergenceAnalysis/Loops/NonAffineUniLoop.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/Loops/NonAffineUniLoop.ll @@ -0,0 +1,110 @@ +; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s + +; CHECK: Divergence of loop for.body { +; CHECK-NEXT: DIVERGENT: %indvars.iv53 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next54, %for.cond.cleanup3 ] +; CHECK-NEXT: DIVERGENT: %indvars.iv.next54 = add nuw nsw i64 %indvars.iv53, 1 +; CHECK-NEXT: DIVERGENT: %5 = add nsw i64 %4, %indvars.iv53 +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds double, double* %A, i64 %5 +; CHECK-NEXT: DIVERGENT: %6 = load double, double* %arrayidx, align 8 +; CHECK-NEXT: DIVERGENT: %8 = add nsw i64 %7, %indvars.iv53 +; CHECK-NEXT: DIVERGENT: %arrayidx14 = getelementptr inbounds double, double* %A, i64 %8 +; CHECK-NEXT: DIVERGENT: %9 = load double, double* %arrayidx14, align 8 +; CHECK-NEXT: DIVERGENT: %add15 = fadd double %6, %9 +; CHECK-NEXT: DIVERGENT: store double %add15, double* %arrayidx14, align 8 +; CHECK-NEXT: } +; CHECK: Divergence of loop for.body8.lr.ph { +; CHECK-NEXT: DIVERGENT: %mul44 = phi i32 [ %mul, %for.cond.cleanup7 ], [ 2, %for.body8.lr.ph.preheader ] +; CHECK-NEXT: DIVERGENT: %len.043 = phi i32 [ %mul44, %for.cond.cleanup7 ], [ 1, %for.body8.lr.ph.preheader ] +; CHECK-NEXT: DIVERGENT: %1 = sext i32 %mul44 to i64 +; CHECK-NEXT: DIVERGENT: %2 = sext i32 %len.043 to i64 +; CHECK-NEXT: DIVERGENT: %mul = shl nsw i32 %mul44, 1 +; CHECK-NEXT: DIVERGENT: %indvars.iv = phi i64 [ 0, %for.body8.lr.ph ], [ %indvars.iv.next, %for.body8 ] +; CHECK-NEXT: DIVERGENT: %3 = add nsw i64 %indvars.iv, %2 +; CHECK-NEXT: DIVERGENT: %4 = mul nsw i64 %3, %0 +; CHECK-NEXT: DIVERGENT: %5 = add nsw i64 %4, %indvars.iv53 +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds double, double* %A, i64 %5 +; CHECK-NEXT: DIVERGENT: %6 = load double, double* %arrayidx, align 8 +; CHECK-NEXT: DIVERGENT: %7 = mul nsw i64 %indvars.iv, %0 +; CHECK-NEXT: DIVERGENT: %8 = add nsw i64 %7, %indvars.iv53 +; CHECK-NEXT: DIVERGENT: %arrayidx14 = getelementptr inbounds double, double* %A, i64 %8 +; CHECK-NEXT: DIVERGENT: %9 = load double, double* %arrayidx14, align 8 +; CHECK-NEXT: DIVERGENT: %add15 = fadd double %6, %9 +; CHECK-NEXT: DIVERGENT: store double %add15, double* %arrayidx14, align 8 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next = add i64 %indvars.iv, %1 +; CHECK-NEXT: DIVERGENT: %cmp6 = icmp slt i64 %indvars.iv.next, %0 +; CHECK-NEXT: DIVERGENT: br i1 %cmp6, label %for.body8, label %for.cond.cleanup7 +; CHECK-NEXT: } +; CHECK: Divergence of loop for.body8 { +; CHECK-NEXT: DIVERGENT: %indvars.iv = phi i64 [ 0, %for.body8.lr.ph ], [ %indvars.iv.next, %for.body8 ] +; CHECK-NEXT: DIVERGENT: %3 = add nsw i64 %indvars.iv, %2 +; CHECK-NEXT: DIVERGENT: %4 = mul nsw i64 %3, %0 +; CHECK-NEXT: DIVERGENT: %5 = add nsw i64 %4, %indvars.iv53 +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds double, double* %A, i64 %5 +; CHECK-NEXT: DIVERGENT: %6 = load double, double* %arrayidx, align 8 +; CHECK-NEXT: DIVERGENT: %7 = mul nsw i64 %indvars.iv, %0 +; CHECK-NEXT: DIVERGENT: %8 = add nsw i64 %7, %indvars.iv53 +; CHECK-NEXT: DIVERGENT: %arrayidx14 = getelementptr inbounds double, double* %A, i64 %8 +; CHECK-NEXT: DIVERGENT: %9 = load double, double* %arrayidx14, align 8 +; CHECK-NEXT: DIVERGENT: %add15 = fadd double %6, %9 +; CHECK-NEXT: DIVERGENT: store double %add15, double* %arrayidx14, align 8 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next = add i64 %indvars.iv, %1 +; CHECK-NEXT: } + +; Function Attrs: norecurse nounwind uwtable +define void @foo(double* nocapture %A, i32 %n) local_unnamed_addr #0 { +entry: + %cmp45 = icmp sgt i32 %n, 0 + br i1 %cmp45, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %cmp242 = icmp sgt i32 %n, 2 + %0 = sext i32 %n to i64 + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.cond.cleanup3, %entry + ret void + +for.body: ; preds = %for.cond.cleanup3, %for.body.lr.ph + %indvars.iv53 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next54, %for.cond.cleanup3 ] + br i1 %cmp242, label %for.body8.lr.ph.preheader, label %for.cond.cleanup3 + +for.body8.lr.ph.preheader: ; preds = %for.body + br label %for.body8.lr.ph + +for.cond.cleanup3: ; preds = %for.cond.cleanup7, %for.body + %indvars.iv.next54 = add nuw nsw i64 %indvars.iv53, 1 + %exitcond = icmp eq i64 %indvars.iv.next54, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +for.body8.lr.ph: ; preds = %for.body8.lr.ph.preheader, %for.cond.cleanup7 + %mul44 = phi i32 [ %mul, %for.cond.cleanup7 ], [ 2, %for.body8.lr.ph.preheader ] + %len.043 = phi i32 [ %mul44, %for.cond.cleanup7 ], [ 1, %for.body8.lr.ph.preheader ] + %1 = sext i32 %mul44 to i64 + %2 = sext i32 %len.043 to i64 + br label %for.body8 + +for.cond.cleanup7: ; preds = %for.body8 + %mul = shl nsw i32 %mul44, 1 + %cmp2 = icmp slt i32 %mul, %n + br i1 %cmp2, label %for.body8.lr.ph, label %for.cond.cleanup3 + +for.body8: ; preds = %for.body8.lr.ph, %for.body8 + %indvars.iv = phi i64 [ 0, %for.body8.lr.ph ], [ %indvars.iv.next, %for.body8 ] + %3 = add nsw i64 %indvars.iv, %2 + %4 = mul nsw i64 %3, %0 + %5 = add nsw i64 %4, %indvars.iv53 + %arrayidx = getelementptr inbounds double, double* %A, i64 %5 + %6 = load double, double* %arrayidx, align 8 + %7 = mul nsw i64 %indvars.iv, %0 + %8 = add nsw i64 %7, %indvars.iv53 + %arrayidx14 = getelementptr inbounds double, double* %A, i64 %8 + %9 = load double, double* %arrayidx14, align 8 + %add15 = fadd double %6, %9 + store double %add15, double* %arrayidx14, align 8 + %indvars.iv.next = add i64 %indvars.iv, %1 + %cmp6 = icmp slt i64 %indvars.iv.next, %0 + br i1 %cmp6, label %for.body8, label %for.cond.cleanup7 +} + +attributes #0 = { norecurse nounwind uwtable } Index: test/Analysis/DivergenceAnalysis/Loops/SingleBlockLoop.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/Loops/SingleBlockLoop.ll @@ -0,0 +1,29 @@ +; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s + +; CHECK: Divergence of loop for.body { +; CHECK-NEXT: DIVERGENT: %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv +; CHECK-NEXT: DIVERGENT: store float 4.200000e+01, float* %arrayidx, align 4 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK-NEXT: } +define void @test1(float* nocapture %A, i64 %n) #0 { + entry: + %cmp = icmp sgt i64 %n, 0 + br i1 %cmp, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body, %for.body.lr.ph + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv + store float 4.200000e+01, float* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +attributes #0 = { nounwind } Index: test/Analysis/DivergenceAnalysis/NVPTX/daorder.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/NVPTX/daorder.ll @@ -0,0 +1,47 @@ +; RUN: opt %s -analyze -divergence -use-gpu-divergence-analysis | FileCheck %s + +target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +define i32 @daorder(i32 %n) { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'daorder' +entry: + %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %cond = icmp slt i32 %tid, 0 + br i1 %cond, label %A, label %B ; divergent +; CHECK: DIVERGENT: br i1 %cond, +A: + %defAtA = add i32 %n, 1 ; uniform +; CHECK-NOT: DIVERGENT: %defAtA = + br label %C +B: + %defAtB = add i32 %n, 2 ; uniform +; CHECK-NOT: DIVERGENT: %defAtB = + br label %C +C: + %defAtC = phi i32 [ %defAtA, %A ], [ %defAtB, %B ] ; divergent +; CHECK: DIVERGENT: %defAtC = + br label %D + +D: + %i = phi i32 [0, %C], [ %i.inc, %E ] ; uniform +; CHECK-NOT: DIVERGENT: %i = phi + br label %E + +E: + %i.inc = add i32 %i, 1 + %loopCnt = icmp slt i32 %i.inc, %n +; CHECK-NOT: DIVERGENT: %loopCnt = + br i1 %loopCnt, label %D, label %exit + +exit: + ret i32 %n +} + +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() +declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() +declare i32 @llvm.nvvm.read.ptx.sreg.tid.z() +declare i32 @llvm.nvvm.read.ptx.sreg.laneid() + +!nvvm.annotations = !{!0} +!0 = !{i32 (i32)* @daorder, !"kernel", i32 1} Index: test/Analysis/DivergenceAnalysis/NVPTX/diverge.ll =================================================================== --- test/Analysis/DivergenceAnalysis/NVPTX/diverge.ll +++ test/Analysis/DivergenceAnalysis/NVPTX/diverge.ll @@ -1,11 +1,11 @@ -; RUN: opt %s -analyze -divergence | FileCheck %s +; RUN: opt %s -analyze -divergence -use-gpu-divergence-analysis | FileCheck %s target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-cuda" ; return (n < 0 ? a + threadIdx.x : b + threadIdx.x) define i32 @no_diverge(i32 %n, i32 %a, i32 %b) { -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'no_diverge' +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'no_diverge' entry: %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() %cond = icmp slt i32 %n, 0 @@ -27,7 +27,7 @@ ; c = b; ; return c; // c is divergent: sync dependent define i32 @sync(i32 %a, i32 %b) { -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'sync' +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'sync' bb1: %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.y() %cond = icmp slt i32 %tid, 5 @@ -48,7 +48,7 @@ ; // c here is divergent because it is sync dependent on threadIdx.x >= 5 ; return c; define i32 @mixed(i32 %n, i32 %a, i32 %b) { -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'mixed' +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'mixed' bb1: %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.z() %cond = icmp slt i32 %tid, 5 @@ -73,7 +73,7 @@ ; We conservatively treats all parameters of a __device__ function as divergent. define i32 @device(i32 %n, i32 %a, i32 %b) { -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'device' +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'device' ; CHECK: DIVERGENT: i32 %n ; CHECK: DIVERGENT: i32 %a ; CHECK: DIVERGENT: i32 %b @@ -98,7 +98,7 @@ ; ; The i defined in the loop is used outside. define i32 @loop() { -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'loop' +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'loop' entry: %laneid = call i32 @llvm.nvvm.read.ptx.sreg.laneid() br label %loop @@ -120,7 +120,7 @@ ; Same as @loop, but the loop is in the LCSSA form. define i32 @lcssa() { -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'lcssa' +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'lcssa' entry: %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() br label %loop @@ -142,49 +142,6 @@ ret i32 1 } -; This test contains an unstructured loop. -; +-------------- entry ----------------+ -; | | -; V V -; i1 = phi(0, i3) i2 = phi(0, i3) -; j1 = i1 + 1 ---> i3 = phi(j1, j2) <--- j2 = i2 + 2 -; ^ | ^ -; | V | -; +-------- switch (tid / i3) ----------+ -; | -; V -; if (i3 == 5) // divergent -; because sync dependent on (tid / i3). -define i32 @unstructured_loop(i1 %entry_cond) { -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'unstructured_loop' -entry: - %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - br i1 %entry_cond, label %loop_entry_1, label %loop_entry_2 -loop_entry_1: - %i1 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ] - %j1 = add i32 %i1, 1 - br label %loop_body -loop_entry_2: - %i2 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ] - %j2 = add i32 %i2, 2 - br label %loop_body -loop_body: - %i3 = phi i32 [ %j1, %loop_entry_1 ], [ %j2, %loop_entry_2 ] - br label %loop_latch -loop_latch: - %div = sdiv i32 %tid, %i3 - switch i32 %div, label %branch [ i32 1, label %loop_entry_1 - i32 2, label %loop_entry_2 ] -branch: - %cmp = icmp eq i32 %i3, 5 - br i1 %cmp, label %then, label %else -; CHECK: DIVERGENT: br i1 %cmp, -then: - ret i32 0 -else: - ret i32 1 -} - ; Verifies sync-dependence is computed correctly in the absense of loops. define i32 @sync_no_loop(i32 %arg) { entry: @@ -210,10 +167,9 @@ declare i32 @llvm.nvvm.read.ptx.sreg.tid.z() declare i32 @llvm.nvvm.read.ptx.sreg.laneid() -!nvvm.annotations = !{!0, !1, !2, !3, !4, !5} +!nvvm.annotations = !{!0, !1, !2, !3, !4} !0 = !{i32 (i32, i32, i32)* @no_diverge, !"kernel", i32 1} !1 = !{i32 (i32, i32)* @sync, !"kernel", i32 1} !2 = !{i32 (i32, i32, i32)* @mixed, !"kernel", i32 1} !3 = !{i32 ()* @loop, !"kernel", i32 1} -!4 = !{i32 (i1)* @unstructured_loop, !"kernel", i32 1} -!5 = !{i32 (i32)* @sync_no_loop, !"kernel", i32 1} +!4 = !{i32 (i32)* @sync_no_loop, !"kernel", i32 1} Index: test/Analysis/DivergenceAnalysis/NVPTX/hidden_diverge.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/NVPTX/hidden_diverge.ll @@ -0,0 +1,30 @@ +; RUN: opt %s -analyze -divergence -use-gpu-divergence-analysis | FileCheck %s + +target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +define i32 @hidden_diverge(i32 %n, i32 %a, i32 %b) { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'hidden_diverge' +entry: + %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %cond.var = icmp slt i32 %tid, 0 + br i1 %cond.var, label %B, label %C ; divergent +; CHECK: DIVERGENT: br i1 %cond.var, +B: + %cond.uni = icmp slt i32 %n, 0 + br i1 %cond.uni, label %C, label %merge ; uniform +; CHECK-NOT: DIVERGENT: br i1 %cond.uni, +C: + %phi.var.hidden = phi i32 [ 1, %entry ], [ 2, %B ] +; CHECK: DIVERGENT: %phi.var.hidden = phi i32 + br label %merge +merge: + %phi.ipd = phi i32 [ %a, %B ], [ %b, %C ] +; CHECK: DIVERGENT: %phi.ipd = phi i32 + ret i32 %phi.ipd +} + +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() + +!nvvm.annotations = !{!0} +!0 = !{i32 (i32, i32, i32)* @hidden_diverge, !"kernel", i32 1} Index: test/Analysis/DivergenceAnalysis/NVPTX/irreducible.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/NVPTX/irreducible.ll @@ -0,0 +1,55 @@ +; RUN: opt %s -analyze -divergence -use-gpu-divergence-analysis | FileCheck %s + +target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +; This test contains an unstructured loop. +; +-------------- entry ----------------+ +; | | +; V V +; i1 = phi(0, i3) i2 = phi(0, i3) +; j1 = i1 + 1 ---> i3 = phi(j1, j2) <--- j2 = i2 + 2 +; ^ | ^ +; | V | +; +-------- switch (tid / i3) ----------+ +; | +; V +; if (i3 == 5) // divergent +; because sync dependent on (tid / i3). +define i32 @unstructured_loop(i1 %entry_cond) { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'unstructured_loop' +entry: + %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + br i1 %entry_cond, label %loop_entry_1, label %loop_entry_2 +loop_entry_1: + %i1 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ] + %j1 = add i32 %i1, 1 + br label %loop_body +loop_entry_2: + %i2 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ] + %j2 = add i32 %i2, 2 + br label %loop_body +loop_body: + %i3 = phi i32 [ %j1, %loop_entry_1 ], [ %j2, %loop_entry_2 ] + br label %loop_latch +loop_latch: + %div = sdiv i32 %tid, %i3 + switch i32 %div, label %branch [ i32 1, label %loop_entry_1 + i32 2, label %loop_entry_2 ] +branch: + %cmp = icmp eq i32 %i3, 5 + br i1 %cmp, label %then, label %else +; CHECK: DIVERGENT: br i1 %cmp, +then: + ret i32 0 +else: + ret i32 1 +} + +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() +declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() +declare i32 @llvm.nvvm.read.ptx.sreg.tid.z() +declare i32 @llvm.nvvm.read.ptx.sreg.laneid() + +!nvvm.annotations = !{!0} +!0 = !{i32 (i1)* @unstructured_loop, !"kernel", i32 1} Index: test/Analysis/LegacyDivergenceAnalysis/AMDGPU/kernel-args.ll =================================================================== --- test/Analysis/LegacyDivergenceAnalysis/AMDGPU/kernel-args.ll +++ test/Analysis/LegacyDivergenceAnalysis/AMDGPU/kernel-args.ll @@ -1,6 +1,6 @@ ; RUN: opt %s -mtriple amdgcn-- -analyze -divergence | FileCheck %s -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'test_amdgpu_ps': +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'test_amdgpu_ps': ; CHECK: DIVERGENT: ; CHECK-NOT: %arg0 ; CHECK-NOT: %arg1 @@ -14,7 +14,7 @@ ret void } -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'test_amdgpu_kernel': +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'test_amdgpu_kernel': ; CHECK-NOT: %arg0 ; CHECK-NOT: %arg1 ; CHECK-NOT: %arg2 @@ -26,7 +26,7 @@ ret void } -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'test_c': +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'test_c': ; CHECK: DIVERGENT: ; CHECK: DIVERGENT: ; CHECK: DIVERGENT: Index: test/Analysis/LegacyDivergenceAnalysis/AMDGPU/lit.local.cfg =================================================================== --- /dev/null +++ test/Analysis/LegacyDivergenceAnalysis/AMDGPU/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'AMDGPU' in config.root.targets: + config.unsupported = True Index: test/Analysis/LegacyDivergenceAnalysis/NVPTX/diverge.ll =================================================================== --- test/Analysis/LegacyDivergenceAnalysis/NVPTX/diverge.ll +++ test/Analysis/LegacyDivergenceAnalysis/NVPTX/diverge.ll @@ -5,7 +5,7 @@ ; return (n < 0 ? a + threadIdx.x : b + threadIdx.x) define i32 @no_diverge(i32 %n, i32 %a, i32 %b) { -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'no_diverge' +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'no_diverge' entry: %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() %cond = icmp slt i32 %n, 0 @@ -27,7 +27,7 @@ ; c = b; ; return c; // c is divergent: sync dependent define i32 @sync(i32 %a, i32 %b) { -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'sync' +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'sync' bb1: %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.y() %cond = icmp slt i32 %tid, 5 @@ -48,7 +48,7 @@ ; // c here is divergent because it is sync dependent on threadIdx.x >= 5 ; return c; define i32 @mixed(i32 %n, i32 %a, i32 %b) { -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'mixed' +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'mixed' bb1: %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.z() %cond = icmp slt i32 %tid, 5 @@ -73,7 +73,7 @@ ; We conservatively treats all parameters of a __device__ function as divergent. define i32 @device(i32 %n, i32 %a, i32 %b) { -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'device' +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'device' ; CHECK: DIVERGENT: i32 %n ; CHECK: DIVERGENT: i32 %a ; CHECK: DIVERGENT: i32 %b @@ -98,7 +98,7 @@ ; ; The i defined in the loop is used outside. define i32 @loop() { -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'loop' +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'loop' entry: %laneid = call i32 @llvm.nvvm.read.ptx.sreg.laneid() br label %loop @@ -120,7 +120,7 @@ ; Same as @loop, but the loop is in the LCSSA form. define i32 @lcssa() { -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'lcssa' +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'lcssa' entry: %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() br label %loop @@ -156,7 +156,7 @@ ; if (i3 == 5) // divergent ; because sync dependent on (tid / i3). define i32 @unstructured_loop(i1 %entry_cond) { -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'unstructured_loop' +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'unstructured_loop' entry: %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() br i1 %entry_cond, label %loop_entry_1, label %loop_entry_2 Index: test/Analysis/LegacyDivergenceAnalysis/NVPTX/lit.local.cfg =================================================================== --- /dev/null +++ test/Analysis/LegacyDivergenceAnalysis/NVPTX/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'NVPTX' in config.root.targets: + config.unsupported = True