Index: include/llvm/ADT/PostOrderIterator.h =================================================================== --- include/llvm/ADT/PostOrderIterator.h +++ include/llvm/ADT/PostOrderIterator.h @@ -296,12 +296,15 @@ public: using rpo_iterator = typename std::vector::reverse_iterator; + using const_rpo_iterator = typename std::vector::const_reverse_iterator; ReversePostOrderTraversal(GraphT G) { Initialize(GT::getEntryNode(G)); } // Because we want a reverse post order, use reverse iterators from the vector rpo_iterator begin() { return Blocks.rbegin(); } + const_rpo_iterator begin() const { return Blocks.crbegin(); } rpo_iterator end() { return Blocks.rend(); } + const_rpo_iterator end() const { return Blocks.crend(); } }; } // end namespace llvm Index: include/llvm/Analysis/DivergenceAnalysis.h =================================================================== --- /dev/null +++ include/llvm/Analysis/DivergenceAnalysis.h @@ -0,0 +1,245 @@ +//===- llvm/Analysis/DivergenceAnalysis.h - Divergence Analysis -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// \file +// The divergence analysis determines which instructions and branches are +// divergent given a set of divergent source instructions. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H +#define LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H + +#include "llvm/ADT/DenseSet.h" +#include "llvm/Analysis/SyncDependenceAnalysis.h" +#include "llvm/IR/Function.h" +#include "llvm/Pass.h" +#include + +namespace llvm { +class Module; +class Value; +class Instruction; +class Loop; +class raw_ostream; +class TargetTransformInfo; + +/// \brief Generic divergence analysis for reducible CFGs. +/// +/// This analysis propagates divergence in a data-parallel context from sources +/// of divergence to all users. It requires reducible CFGs. All assignments +/// should be in SSA form. +class DivergenceAnalysis { +public: + /// \brief This instance will analyze the whole function \p or the loop \p + /// RegionLoop. + /// + /// \param RegionLoop If non-null the analysis is restricted to \p RegionLoop. + /// Otherwise the whole function is analyzed. + /// \param IsLCSSAFOrm whether the analysis may assume that the IR in the + /// region in in LCSSA form. + DivergenceAnalysis(const Function &F, const Loop *RegionLoop, + const DominatorTree &DT, const LoopInfo &LI, + SyncDependenceAnalysis &SDA, bool IsLCSSAForm); + + /// \brief the loop that defines the analyzed region (if any). + const Loop *getRegionLoop() const { return RegionLoop; } + const Function &getFunction() const { return F; } + + /// \brief Whether \p BB is part of the region. + bool inRegion(const BasicBlock &BB) const; + /// \brief Whether \p I is part of the region. + bool inRegion(const Instruction &I) const; + + /// \brief Mark \p UniVal as a value that is always uniform. + void addUniformOverride(const Value &UniVal); + + /// \brief Mark \p DivVal as a value that is always divergent. + void markDivergent(const Value &DivVal); + + /// \brief Propagate divergence to all instructions in the region. + /// Divergence is seeded by calls to \p markDivergent. + void compute(); + + /// \brief Whether any value was marked or analyzed to be divergent. + bool hasDetectedDivergence() const { return !DivergentValues.empty(); } + + /// \brief Whether \p Val will always return a uniform value regardless of its + /// operands + bool isAlwaysUniform(const Value &Val) const; + + /// \brief Whether \p Val is a divergent value + bool isDivergent(const Value &Val) const; + + void print(raw_ostream &OS, const Module *) const; + +private: + bool updateTerminator(const TerminatorInst &Term) const; + bool updatePHINode(const PHINode &Phi) const; + + /// \returns whether \p Inst should be considered divergent based on the + /// divergence of its operands. + /// + /// This should only be called for non-phi, non-terminator instructions. + bool updateNormalInstruction(const Instruction &Inst) const; + + /// \brief mark users of live-out users as divergent. + /// + /// \param LoopHeader the header of the divergent loop. + /// + /// marks all users of live-out values of the loop headed by @loopHeader + /// as divergent and puts them on the worklist. + void taintLoopLiveOuts(const BasicBlock &LoopHeader); + + /// \brief Push all users of \p Val (in the region) to the worklist + void pushUsers(const Value &I); + + /// \brief Push all phi nodes in @block to the worklist + void pushPHINodes(const BasicBlock &Block); + + /// \brief Mark \p Block as join divergent + /// + /// A block is join divergent if two threads may reach it from different + /// incoming blocks at the same time. + void markBlockJoinDivergent(const BasicBlock &Block) { + DivergentJoinBlocks.insert(&Block); + } + + /// \brief Whether \p Val is divergent when read in \p ObservingBlock. + bool isTemporalDivergent(const BasicBlock &ObservingBlock, + const Value &Val) const; + + /// \brief Whether \p Block is join divergent + /// + /// (see markBlockJoinDivergent). + bool isJoinDivergent(const BasicBlock &Block) const { + return DivergentJoinBlocks.count(&Block); + } + + /// \brief Propagate control-induced divergence to users (phi nodes and + /// instructions). + // + // \param JoinBlock is a divergent loop exit or join point of two disjoint + // paths. + // \returns Whether \p JoinBlock is a divergent loop exit of \p TermLoop. + bool propagateJoinDivergence(const BasicBlock &JoinBlock, + const Loop *TermLoop); + + /// \brief Propagate induced value divergence due to control divergence in \p + /// Term. + void propagateBranchDivergence(const TerminatorInst &Term); + + /// \brief Propagate divergent caused by a divergent loop exit. + /// + /// \param ExitingLoop is a divergent loop. + void propagateLoopDivergence(const Loop &ExitingLoop); + +private: + const Function &F; + // If regionLoop != nullptr, analysis is only performed within \p RegionLoop. + // Otw, analyze the whole function + const Loop *RegionLoop; + + const DominatorTree &DT; + const LoopInfo &LI; + + // Recognized divergent loops + DenseSet DivergentLoops; + + // The SDA links divergent branches to divergent control-flow joins. + SyncDependenceAnalysis &SDA; + + // Use simplified code path for LCSSA form. + bool IsLCSSAForm; + + // Set of known-uniform values. + DenseSet UniformOverrides; + + // Blocks with joining divergent control from different predecessors. + DenseSet DivergentJoinBlocks; + + // Detected/marked divergent values. + DenseSet DivergentValues; + + // Internal worklist for divergence propagation. + std::vector Worklist; +}; + +/// \brief Divergence analysis frontend for loops. +class LoopDivergenceAnalysis { +public: + LoopDivergenceAnalysis(const DominatorTree &DT, const LoopInfo &LI, + SyncDependenceAnalysis &SDA, const Loop &loop); + + /// Whether \p V is divergent. + bool isDivergent(const Value &V) const; + + /// Whether \p V is uniform/non-divergent. + bool isUniform(const Value &V) const { return !isDivergent(V); } + + /// Print all divergent values in the loop. + void print(raw_ostream &OS, const Module *) const; + +private: + DivergenceAnalysis DA; +}; + +/// \brief Loop divergence printer pass. +/// This is intended for use in LIT testing. +class LoopDivergencePrinter : public FunctionPass { +public: + static char ID; + + LoopDivergencePrinter() : FunctionPass(ID) { + initializeLoopDivergencePrinterPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override; + + /// Analyze all loop-divergence of all loops in @F and print the results. + bool runOnFunction(Function &F) override; + + /// Print all divergent values in the function. + void print(raw_ostream &OS, const Module *) const override; + +private: + std::unique_ptr SDA; + SmallVector, 6> LoopDivInfo; +}; + +/// \brief Divergence analysis frontend for GPU kernels. +class GPUDivergenceAnalysis { + SyncDependenceAnalysis SDA; + DivergenceAnalysis DA; + +public: + /// Runs the divergence analysis on @F, a GPU kernel + GPUDivergenceAnalysis(Function &F, const DominatorTree &DT, + const PostDominatorTree &PDT, const LoopInfo &LI, + const TargetTransformInfo &TTI); + + /// Whether any divergence was detected. + bool hasDivergence() const { return DA.hasDetectedDivergence(); } + + /// The GPU kernel this analysis result is for + const Function &getFunction() const { return DA.getFunction(); } + + /// Whether \p V is divergent. + bool isDivergent(const Value &V) const; + + /// Whether \p V is uniform/non-divergent + bool isUniform(const Value &V) const { return !isDivergent(V); } + + /// Print all divergent values in the kernel. + void print(raw_ostream &OS, const Module *) const; +}; + +} // namespace llvm + +#endif // LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H Index: include/llvm/Analysis/LegacyDivergenceAnalysis.h =================================================================== --- include/llvm/Analysis/LegacyDivergenceAnalysis.h +++ include/llvm/Analysis/LegacyDivergenceAnalysis.h @@ -19,9 +19,11 @@ #include "llvm/ADT/DenseSet.h" #include "llvm/IR/Function.h" #include "llvm/Pass.h" +#include "llvm/Analysis/DivergenceAnalysis.h" namespace llvm { class Value; +class GPUDivergenceAnalysis; class LegacyDivergenceAnalysis : public FunctionPass { public: static char ID; @@ -41,7 +43,7 @@ // // Even if this function returns false, V may still be divergent when used // in a different basic block. - bool isDivergent(const Value *V) const { return DivergentValues.count(V); } + bool isDivergent(const Value *V) const; // Returns true if V is uniform/non-divergent. // @@ -53,6 +55,12 @@ void removeValue(const Value *V) { DivergentValues.erase(V); } private: + // Whether analysis should be performed by GPUDivergenceAnalysis. + bool shouldUseGPUDivergenceAnalysis(const Function &F) const; + + // (optional) handle to new DivergenceAnalysis + std::unique_ptr gpuDA; + // Stores all divergent values. DenseSet DivergentValues; }; Index: include/llvm/Analysis/Passes.h =================================================================== --- include/llvm/Analysis/Passes.h +++ include/llvm/Analysis/Passes.h @@ -66,6 +66,13 @@ // FunctionPass *createLegacyDivergenceAnalysisPass(); + //===--------------------------------------------------------------------===// + // + // createLoopDivergencePrinterPass - This pass determines which branches and + // instructions in a loop are divergent. + // + FunctionPass *createLoopDivergencePrinterPass(); + //===--------------------------------------------------------------------===// // // Minor pass prototypes, allowing us to expose them through bugpoint and Index: include/llvm/Analysis/SyncDependenceAnalysis.h =================================================================== --- /dev/null +++ include/llvm/Analysis/SyncDependenceAnalysis.h @@ -0,0 +1,88 @@ +//===- SyncDependenceAnalysis.h - Divergent Branch Dependence -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// \file +// This file defines the SyncDependenceAnalysis class, which computes for +// every divergent branch the set of phi nodes that the branch will make +// divergent. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_SYNC_DEPENDENCE_ANALYSIS_H +#define LLVM_ANALYSIS_SYNC_DEPENDENCE_ANALYSIS_H + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/Analysis/LoopInfo.h" +#include + +namespace llvm { + +class BasicBlock; +class DominatorTree; +class Loop; +class PostDominatorTree; +class TerminatorInst; +class TerminatorInst; + +using ConstBlockSet = SmallPtrSet; + +/// \brief Relates points of divergent control to join points in +/// reducible CFGs. +/// +/// This analysis relates points of divergent control to points of converging +/// divergent control. The analysis requires all loops to be reducible. +class SyncDependenceAnalysis { + void visitSuccessor(const BasicBlock &succBlock, const Loop *termLoop, + const BasicBlock *defBlock); + +public: + bool inRegion(const BasicBlock &BB) const; + + ~SyncDependenceAnalysis(); + SyncDependenceAnalysis(const DominatorTree &DT, const PostDominatorTree &PDT, + const LoopInfo &LI); + + /// \brief Computes divergent join points and loop exits caused by divergent + /// in \p Term. + /// + /// The set of blocks which are reachable by disjoint paths from \p Term. + /// The set also contains loop exits if there two disjoin paths: + /// one from \p Term to the loop exit and another from @term to the loop + /// header. Those exit blocks are added to the returned set. If L is the + /// parent loop of @term and an exit of L is in the returned set then L is a + /// divergent loop. + const ConstBlockSet &join_blocks(const TerminatorInst &term); + + /// \brief Computes divergent join points and loop exits caused by the + /// divergent loop \p Loop. + /// + /// The set of blocks which are reachable by disjoint paths from the + /// loop exits of @loop. + /// This treats the loop as a single node in @loop's parent loop. + /// The returned set has the same properties as for join_blocks(TermInst&). + const ConstBlockSet &join_blocks(const Loop &Loop); + +private: + static ConstBlockSet EmptyBlockSet; + + ReversePostOrderTraversal FuncRPOT; + const DominatorTree &DT; + const PostDominatorTree &PDT; + const LoopInfo &LI; + + std::map> CachedLoopExitJoins; + std::map> + CachedBranchJoins; +}; + +} // namespace llvm + +#endif // LLVM_ANALYSIS_SYNC_DEPENDENCE_ANALYSIS_H Index: include/llvm/InitializePasses.h =================================================================== --- include/llvm/InitializePasses.h +++ include/llvm/InitializePasses.h @@ -211,6 +211,7 @@ void initializeLoopDataPrefetchLegacyPassPass(PassRegistry&); void initializeLoopDeletionLegacyPassPass(PassRegistry&); void initializeLoopDistributeLegacyPass(PassRegistry&); +void initializeLoopDivergencePrinterPass(PassRegistry&); void initializeLoopExtractorPass(PassRegistry&); void initializeLoopGuardWideningLegacyPassPass(PassRegistry&); void initializeLoopIdiomRecognizeLegacyPassPass(PassRegistry&); Index: include/llvm/LinkAllPasses.h =================================================================== --- include/llvm/LinkAllPasses.h +++ include/llvm/LinkAllPasses.h @@ -122,6 +122,7 @@ (void) llvm::createLCSSAPass(); (void) llvm::createLegacyDivergenceAnalysisPass(); (void) llvm::createLICMPass(); + (void) llvm::createLoopDivergencePrinterPass(); (void) llvm::createLoopSinkPass(); (void) llvm::createLazyValueInfoPass(); (void) llvm::createLoopExtractorPass(); Index: lib/Analysis/Analysis.cpp =================================================================== --- lib/Analysis/Analysis.cpp +++ lib/Analysis/Analysis.cpp @@ -59,6 +59,7 @@ initializeLazyValueInfoPrinterPass(Registry); initializeLegacyDivergenceAnalysisPass(Registry); initializeLintPass(Registry); + initializeLoopDivergencePrinterPass(Registry); initializeLoopInfoWrapperPassPass(Registry); initializeMemDepPrinterPass(Registry); initializeMemDerefPrinterPass(Registry); Index: lib/Analysis/CMakeLists.txt =================================================================== --- lib/Analysis/CMakeLists.txt +++ lib/Analysis/CMakeLists.txt @@ -25,6 +25,7 @@ Delinearization.cpp DemandedBits.cpp DependenceAnalysis.cpp + DivergenceAnalysis.cpp DomPrinter.cpp DominanceFrontier.cpp EHPersonalities.cpp @@ -79,6 +80,7 @@ ScalarEvolutionAliasAnalysis.cpp ScalarEvolutionExpander.cpp ScalarEvolutionNormalization.cpp + SyncDependenceAnalysis.cpp SyntheticCountsUtils.cpp TargetLibraryInfo.cpp TargetTransformInfo.cpp Index: lib/Analysis/DivergenceAnalysis.cpp =================================================================== --- /dev/null +++ lib/Analysis/DivergenceAnalysis.cpp @@ -0,0 +1,537 @@ +//===- DivergenceAnalysis.cpp --------- Divergence Analysis Implementation -==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements a general divergence analysis for loop vectorization +// and GPU programs. It determines whether a branch in a loop or GPU program +// is divergent. It can help branch optimizations such as jump threading and +// loop unswitching to make better decisions. +// +// GPU programs typically use the SIMD execution model, where multiple threads +// in the same execution group have to execute in lock-step. Therefore, if the +// code contains divergent branches (i.e., threads in a group do not agree on +// which path of the branch to take), the group of threads has to execute all +// the paths from that branch with different subsets of threads enabled until +// they converge at the immediately post-dominating BB of the paths. +// +// Due to this execution model, some optimizations such as jump +// threading and loop unswitching can be unfortunately harmful when performed on +// divergent branches. Therefore, an analysis that computes which branches in a +// GPU program are divergent can help the compiler to selectively run these +// optimizations. +// +// This file defines divergence analysis which computes a conservative but +// non-trivial approximation of all divergent branches in a GPU program. This +// implementation is derived from the Vectorization Analysis of the Region +// Vectorizer (RV). That implementation in turn is based on the approach +// described in +// +// Improving Performance of OpenCL on CPUs +// Ralf Karrenberg and Sebastian Hack +// CC '12 +// +// The DivergenceAnalysis implementation is generic in the sense that it does +// not itself identify original sources of divergence. +// Instead specialized adapter classes, (LoopDivergenceAnalysis) for loops and +// (GPUDivergenceAnalysis) for GPU programs, identify the sources of divergence +// (e.g., special variables that hold the thread ID or the iteration variable). +// +// The generic implementation propagates divergence to variables that are data +// or sync dependent on a source of divergence. +// +// While data dependency is a well-known concept, the notion of sync dependency +// is worth more explanation. Sync dependence characterizes the control flow +// aspect of the propagation of branch divergence. For example, +// +// %cond = icmp slt i32 %tid, 10 +// br i1 %cond, label %then, label %else +// then: +// br label %merge +// else: +// br label %merge +// merge: +// %a = phi i32 [ 0, %then ], [ 1, %else ] +// +// Suppose %tid holds the thread ID. Although %a is not data dependent on %tid +// because %tid is not on its use-def chains, %a is sync dependent on %tid +// because the branch "br i1 %cond" depends on %tid and affects which value %a +// is assigned to. +// +// The sync dependence detection (which branch induces divergence in which join +// points) is implemented in the SyncDependenceAnalysis. +// +// The current DivergenceAnalysis implementation has the following limitations: +// 1. intra-procedural. It conservatively considers the arguments of a +// non-kernel-entry function and the return value of a function call as +// divergent. +// 2. memory as black box. It conservatively considers values loaded from +// generic or local address as divergent. This can be improved by leveraging +// pointer analysis and/or by modelling non-escaping memory objects in SSA +// as done in RV. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/Passes.h" +#include "llvm/Analysis/PostDominators.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Value.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include + +using namespace llvm; + +#define DEBUG_TYPE "divergence-analysis" + +// class DivergenceAnalysis +DivergenceAnalysis::DivergenceAnalysis( + const Function &F, const Loop *RegionLoop, const DominatorTree &DT, + const LoopInfo &LI, SyncDependenceAnalysis &SDA, bool IsLCSSAForm) + : F(F), RegionLoop(RegionLoop), DT(DT), LI(LI), SDA(SDA), + IsLCSSAForm(IsLCSSAForm) {} + +void DivergenceAnalysis::markDivergent(const Value &DivVal) { + assert(isa(DivVal) || isa(DivVal)); + assert(!isAlwaysUniform(DivVal) && "cannot be a divergent"); + DivergentValues.insert(&DivVal); +} + +void DivergenceAnalysis::addUniformOverride(const Value &UniVal) { + UniformOverrides.insert(&UniVal); +} + +bool DivergenceAnalysis::updateTerminator(const TerminatorInst &Term) const { + if (Term.getNumSuccessors() <= 1) + return false; + if (auto *BranchTerm = dyn_cast(&Term)) { + assert(BranchTerm->isConditional()); + return isDivergent(*BranchTerm->getCondition()); + } + if (auto *SwitchTerm = dyn_cast(&Term)) { + return isDivergent(*SwitchTerm->getCondition()); + } + if (isa(Term)) { + return false; // ignore abnormal executions through landingpad + } + + abort(); +} + +bool DivergenceAnalysis::updateNormalInstruction(const Instruction &I) const { + // TODO function calls with side effects, etc + for (const auto &Op : I.operands()) { + if (isDivergent(*Op)) + return true; + } + return false; +} + +bool DivergenceAnalysis::isTemporalDivergent(const BasicBlock &ObservingBlock, + const Value &Val) const { + const auto *Inst = dyn_cast(&Val); + if (!Inst) + return false; + // check whether any divergent loop carrying @val terminates before control + // proceeds to @observingBlock + for (const auto *Loop = LI.getLoopFor(Inst->getParent()); + Loop != RegionLoop && !Loop->contains(&ObservingBlock); + Loop = Loop->getParentLoop()) { + if (DivergentLoops.find(Loop) != DivergentLoops.end()) + return true; + } + + return false; +} + +bool DivergenceAnalysis::updatePHINode(const PHINode &Phi) const { + // joining divergent disjoint path in @phi parent block + if (!Phi.hasConstantOrUndefValue() && isJoinDivergent(*Phi.getParent())) { + return true; + } + + // An incoming value could be divergent by itself. + // Otherwise, an incoming value could be uniform within the loop + // that carries its definition but it may appear divergent + // from outside the loop. This happens when divergent loop exits + // drop definitions of that uniform value in different iterations. + // + // for (int i = 0; i < n; ++i) { // 'i' is uniform inside the loop + // if (i % thread_id == 0) break; // divergent loop exit + // } + // int divI = i; // divI is divergent + for (size_t i = 0; i < Phi.getNumIncomingValues(); ++i) { + const auto *InVal = Phi.getIncomingValue(i); + if (isDivergent(*Phi.getIncomingValue(i)) || + isTemporalDivergent(*Phi.getParent(), *InVal)) { + return true; + } + } + return false; +} + +bool DivergenceAnalysis::inRegion(const Instruction &I) const { + return I.getParent() && inRegion(*I.getParent()); +} + +bool DivergenceAnalysis::inRegion(const BasicBlock &BB) const { + return (!RegionLoop && BB.getParent() == &F) || RegionLoop->contains(&BB); +} + +// marks all users of loop-carried values of the loop headed by @loopHeader as +// divergent +void DivergenceAnalysis::taintLoopLiveOuts(const BasicBlock &LoopHeader) { + auto *DivLoop = LI.getLoopFor(&LoopHeader); + assert(DivLoop && "loopHeader is not actually part of a loop"); + + SmallVector TaintStack; + DivLoop->getExitBlocks(TaintStack); + + // Otherwise potential users of loop-carried values could be anywhere in the + // dominance region of @DivLoop (including its fringes for phi nodes) + DenseSet Visited; + for (auto *Block : TaintStack) { + Visited.insert(Block); + } + Visited.insert(&LoopHeader); + + while (!TaintStack.empty()) { + auto *UserBlock = TaintStack.back(); + TaintStack.pop_back(); + + // don't spread divergence beyond the region + if (!inRegion(*UserBlock)) + continue; + + assert(!DivLoop->contains(UserBlock) && + "irreducible control flow detected"); + + // phi nodes at the fringes of the dominance region + if (!DT.dominates(&LoopHeader, UserBlock)) { + // all PHI nodes of @userBlock become divergent + for (auto &Phi : UserBlock->phis()) { + Worklist.push_back(&Phi); + } + continue; + } + + // taint outside users of values carried by @DivLoop + for (auto &I : *UserBlock) { + if (isAlwaysUniform(I)) + continue; + if (isDivergent(I)) + continue; + + for (auto &Op : I.operands()) { + auto *OpInst = dyn_cast(&Op); + if (!OpInst) + continue; + if (DivLoop->contains(OpInst->getParent())) { + markDivergent(I); + pushUsers(I); + break; + } + } + } + + // visit all blocks in the dominance region + for (auto *SuccBlock : successors(UserBlock)) { + if (!Visited.insert(SuccBlock).second) { + continue; + } + TaintStack.push_back(SuccBlock); + } + } +} + +void DivergenceAnalysis::pushPHINodes(const BasicBlock &Block) { + for (const auto &Phi : Block.phis()) { + if (isDivergent(Phi)) + continue; + Worklist.push_back(&Phi); + } +} + +void DivergenceAnalysis::pushUsers(const Value &V) { + for (const auto *User : V.users()) { + const auto *UserInst = dyn_cast(User); + if (!UserInst) + continue; + + if (isDivergent(*UserInst)) + continue; + + // only compute divergent inside loop + if (!inRegion(*UserInst)) + continue; + Worklist.push_back(UserInst); + } +} + +bool DivergenceAnalysis::propagateJoinDivergence(const BasicBlock &JoinBlock, + const Loop *BranchLoop) { + LLVM_DEBUG(dbgs() << "\tpropJoinDiv " << JoinBlock.getName() << "\n"); + + // ignore divergence outside the region + if (!inRegion(JoinBlock)) { + return false; + } + + // push non-divergent phi nodes in @joinBlock to the worklist + pushPHINodes(JoinBlock); + + // @joinBlock is a divergent loop exit + if (BranchLoop && !BranchLoop->contains(&JoinBlock)) { + return true; + } + + // disjoint-paths divergent at @joinBlock + markBlockJoinDivergent(JoinBlock); + return false; +} + +void DivergenceAnalysis::propagateBranchDivergence(const TerminatorInst &Term) { + LLVM_DEBUG(dbgs() << "propBranchDiv " << Term.getParent()->getName() << "\n"); + + markDivergent(Term); + + const auto *BranchLoop = LI.getLoopFor(Term.getParent()); + + // whether there is a divergent loop exit from @branchLoop (if any) + bool IsBranchLoopDivergent = false; + + // iterate over all blocks reachable by disjoint from @term within the loop + // also iterates over loop exits that become divergent due to @term. + for (const auto *JoinBlock : SDA.join_blocks(Term)) { + IsBranchLoopDivergent |= propagateJoinDivergence(*JoinBlock, BranchLoop); + } + + // @branch loop is a divergent loop due to the divergent branch in @term + if (IsBranchLoopDivergent) { + assert(BranchLoop); + if (!DivergentLoops.insert(BranchLoop).second) { + return; + } + propagateLoopDivergence(*BranchLoop); + } +} + +void DivergenceAnalysis::propagateLoopDivergence(const Loop &ExitingLoop) { + LLVM_DEBUG(dbgs() << "propLoopDiv " << ExitingLoop.getName() << "\n"); + + // don't propagate beyond region + if (!inRegion(*ExitingLoop.getHeader())) + return; + + const auto *BranchLoop = ExitingLoop.getParentLoop(); + + // Uses of loop-carried values could occur anywhere + // within the dominance region of the definition. All loop-carried + // definitions are dominated by the loop header (reducible control). + // Thus all users have to be in the dominance region of the loop header, + // except PHI nodes that can also live at the fringes of the dom region + // (incoming defining value). + if (!IsLCSSAForm) + taintLoopLiveOuts(*ExitingLoop.getHeader()); + + // whether there is a divergent loop exit from @branchLoop (if any) + bool IsBranchLoopDivergent = false; + + // iterate over all blocks reachable by disjoint paths from exits of + // @exitingLoop also iterates over loop exits (of @branchLoop) that in turn + // become divergent. + for (const auto *JoinBlock : SDA.join_blocks(ExitingLoop)) { + IsBranchLoopDivergent |= propagateJoinDivergence(*JoinBlock, BranchLoop); + } + + // @branch loop is a divergent due to divergent loop exit in @exitingLoop + if (IsBranchLoopDivergent) { + assert(BranchLoop); + if (!DivergentLoops.insert(BranchLoop).second) { + return; + } + propagateLoopDivergence(*BranchLoop); + } +} + +void DivergenceAnalysis::compute() { + for (auto *DivVal : DivergentValues) { + pushUsers(*DivVal); + } + + // propagate divergence + while (!Worklist.empty()) { + const Instruction &I = *Worklist.back(); + Worklist.pop_back(); + + // maintain uniformity of overrides + if (isAlwaysUniform(I)) + continue; + + bool WasDivergent = isDivergent(I); + if (WasDivergent) + continue; + + // propagate divergence caused by terminator + if (isa(I)) { + auto &Term = cast(I); + if (updateTerminator(Term)) { + // propagate control divergence to affected instructions + propagateBranchDivergence(Term); + continue; + } + } + + // update divergence of I due to divergent operands + bool DivergentUpd = false; + const auto *Phi = dyn_cast(&I); + if (Phi) { + DivergentUpd = updatePHINode(*Phi); + } else { + DivergentUpd = updateNormalInstruction(I); + } + + // propagate value divergence to users + if (DivergentUpd) { + markDivergent(I); + pushUsers(I); + } + } +} + +bool DivergenceAnalysis::isAlwaysUniform(const Value &V) const { + return UniformOverrides.find(&V) != UniformOverrides.end(); +} + +bool DivergenceAnalysis::isDivergent(const Value &V) const { + return DivergentValues.find(&V) != DivergentValues.end(); +} + +void DivergenceAnalysis::print(raw_ostream &OS, const Module *) const { + if (DivergentValues.empty()) + return; + // iterate instructions using instructions() to ensure a deterministic order. + for (auto &I : instructions(F)) { + if (isDivergent(I)) + OS << "DIVERGENT:" << I << '\n'; + } +} + +// class LoopDivergenceAnalysis +LoopDivergenceAnalysis::LoopDivergenceAnalysis(const DominatorTree &DT, + const LoopInfo &LI, + SyncDependenceAnalysis &SDA, + const Loop &Loop) + : DA(*Loop.getHeader()->getParent(), &Loop, DT, LI, SDA, true) { + for (const auto &Phi : Loop.getHeader()->phis()) { + DA.markDivergent(Phi); + } + + // after the scalar remainder loop is extracted, the loop exit condition will + // be uniform + auto LoopExitingInst = Loop.getExitingBlock()->getTerminator(); + auto LoopExitCond = cast(LoopExitingInst)->getCondition(); + DA.addUniformOverride(*LoopExitCond); + + DA.compute(); +} + +bool LoopDivergenceAnalysis::isDivergent(const Value &V) const { + return DA.isDivergent(V); +} + +void LoopDivergenceAnalysis::print(raw_ostream &OS, const Module *Mod) const { + OS << "Divergence of loop " << DA.getRegionLoop()->getName() << " {\n"; + DA.print(OS, Mod); + OS << "}\n"; +} + +// class LoopDivergencePrinter +bool LoopDivergencePrinter::runOnFunction(Function &F) { + const PostDominatorTree &PDT = + getAnalysis().getPostDomTree(); + const DominatorTree &DT = + getAnalysis().getDomTree(); + const LoopInfo &LI = getAnalysis().getLoopInfo(); + SDA = make_unique(DT, PDT, LI); + + for (auto &BB : F) { + auto *Loop = LI.getLoopFor(&BB); + if (!Loop || Loop->getHeader() != &BB) + continue; + LoopDivInfo.push_back( + make_unique(DT, LI, *SDA, *Loop)); + } + + return false; +} + +void LoopDivergencePrinter::print(raw_ostream &OS, const Module *Mod) const { + for (auto &DivInfo : LoopDivInfo) { + DivInfo->print(OS, Mod); + } +} + +// Register this pass. +char LoopDivergencePrinter::ID = 0; +INITIALIZE_PASS_BEGIN(LoopDivergencePrinter, "loop-divergence", + "Loop Divergence Printer", false, true) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_END(LoopDivergencePrinter, "loop-divergence", + "Loop Divergence Printer", false, true) + +FunctionPass *llvm::createLoopDivergencePrinterPass() { + return new LoopDivergencePrinter(); +} + +void LoopDivergencePrinter::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.setPreservesAll(); +} + +// class GPUDivergenceAnalysis +GPUDivergenceAnalysis::GPUDivergenceAnalysis(Function &F, + const DominatorTree &DT, + const PostDominatorTree &PDT, + const LoopInfo &LI, + const TargetTransformInfo &TTI) + : SDA(DT, PDT, LI), DA(F, nullptr, DT, LI, SDA, false) { + for (auto &I : instructions(F)) { + if (TTI.isSourceOfDivergence(&I)) { + DA.markDivergent(I); + } else if (TTI.isAlwaysUniform(&I)) { + DA.addUniformOverride(I); + } + } + for (auto &Arg : F.args()) { + if (TTI.isSourceOfDivergence(&Arg)) { + DA.markDivergent(Arg); + } + } + + DA.compute(); +} + +bool GPUDivergenceAnalysis::isDivergent(const Value &val) const { + return DA.isDivergent(val); +} + +void GPUDivergenceAnalysis::print(raw_ostream &OS, const Module *mod) const { + OS << "Divergence of kernel " << DA.getFunction().getName() << " {\n"; + DA.print(OS, mod); + OS << "}\n"; +} Index: lib/Analysis/LegacyDivergenceAnalysis.cpp =================================================================== --- lib/Analysis/LegacyDivergenceAnalysis.cpp +++ lib/Analysis/LegacyDivergenceAnalysis.cpp @@ -1,4 +1,5 @@ -//===- LegacyDivergenceAnalysis.cpp --------- Legacy Divergence Analysis Implementation -==// +//===- LegacyDivergenceAnalysis.cpp --------- Legacy Divergence Analysis +//Implementation -==// // // The LLVM Compiler Infrastructure // @@ -64,6 +65,9 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/DivergenceAnalysis.h" #include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/Analysis/Passes.h" #include "llvm/Analysis/PostDominators.h" @@ -79,6 +83,12 @@ #define DEBUG_TYPE "divergence" +// transparently use the GPUDivergenceAnalysis +static cl::opt UseGPUDA("use-gpu-divergence-analysis", cl::init(false), + cl::Hidden, + cl::desc("turn the LegacyDivergenceAnalysis into " + "a wrapper for GPUDivergenceAnalysis")); + namespace { class DivergencePropagator { @@ -262,16 +272,17 @@ } } -} /// end namespace anonymous +} // namespace // Register this pass. char LegacyDivergenceAnalysis::ID = 0; -INITIALIZE_PASS_BEGIN(LegacyDivergenceAnalysis, "divergence", "Legacy Divergence Analysis", - false, true) +INITIALIZE_PASS_BEGIN(LegacyDivergenceAnalysis, "divergence", + "Legacy Divergence Analysis", false, true) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) -INITIALIZE_PASS_END(LegacyDivergenceAnalysis, "divergence", "Legacy Divergence Analysis", - false, true) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_END(LegacyDivergenceAnalysis, "divergence", + "Legacy Divergence Analysis", false, true) FunctionPass *llvm::createLegacyDivergenceAnalysisPass() { return new LegacyDivergenceAnalysis(); @@ -280,9 +291,24 @@ void LegacyDivergenceAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); AU.addRequired(); + if (UseGPUDA) + AU.addRequired(); AU.setPreservesAll(); } +bool LegacyDivergenceAnalysis::shouldUseGPUDivergenceAnalysis( + const Function &F) const { + if (!UseGPUDA) + return false; + + // GPUDivergenceAnalysis requires a reducible CFG. + auto &LI = getAnalysis().getLoopInfo(); + using RPOTraversal = ReversePostOrderTraversal; + RPOTraversal FuncRPOT(&F); + return !containsIrreducibleCFG(FuncRPOT, LI); +} + bool LegacyDivergenceAnalysis::runOnFunction(Function &F) { auto *TTIWP = getAnalysisIfAvailable(); if (TTIWP == nullptr) @@ -295,36 +321,59 @@ return false; DivergentValues.clear(); + gpuDA = nullptr; + + auto &DT = getAnalysis().getDomTree(); auto &PDT = getAnalysis().getPostDomTree(); - DivergencePropagator DP(F, TTI, - getAnalysis().getDomTree(), - PDT, DivergentValues); - DP.populateWithSourcesOfDivergence(); - DP.propagate(); - LLVM_DEBUG( - dbgs() << "\nAfter divergence analysis on " << F.getName() << ":\n"; - print(dbgs(), F.getParent()) - ); + + if (shouldUseGPUDivergenceAnalysis(F)) { + // run the new GPU divergence analysis + auto &LI = getAnalysis().getLoopInfo(); + gpuDA = llvm::make_unique(F, DT, PDT, LI, TTI); + + } else { + // run LLVM's existing DivergenceAnalysis + DivergencePropagator DP(F, TTI, DT, PDT, DivergentValues); + DP.populateWithSourcesOfDivergence(); + DP.propagate(); + } + + LLVM_DEBUG(dbgs() << "\nAfter divergence analysis on " << F.getName() + << ":\n"; + print(dbgs(), F.getParent())); + return false; } +bool LegacyDivergenceAnalysis::isDivergent(const Value *V) const { + if (gpuDA) { + return gpuDA->isDivergent(*V); + } + return DivergentValues.count(V); +} + void LegacyDivergenceAnalysis::print(raw_ostream &OS, const Module *) const { - if (DivergentValues.empty()) + if ((!gpuDA || !gpuDA->hasDivergence()) && DivergentValues.empty()) return; - const Value *FirstDivergentValue = *DivergentValues.begin(); + const Function *F; - if (const Argument *Arg = dyn_cast(FirstDivergentValue)) { - F = Arg->getParent(); - } else if (const Instruction *I = - dyn_cast(FirstDivergentValue)) { - F = I->getParent()->getParent(); - } else { - llvm_unreachable("Only arguments and instructions can be divergent"); + if (!DivergentValues.empty()) { + const Value *FirstDivergentValue = *DivergentValues.begin(); + if (const Argument *Arg = dyn_cast(FirstDivergentValue)) { + F = Arg->getParent(); + } else if (const Instruction *I = + dyn_cast(FirstDivergentValue)) { + F = I->getParent()->getParent(); + } else { + llvm_unreachable("Only arguments and instructions can be divergent"); + } + } else if (gpuDA) { + F = &gpuDA->getFunction(); } // Dumps all divergent values in F, arguments and then instructions. for (auto &Arg : F->args()) { - OS << (DivergentValues.count(&Arg) ? "DIVERGENT: " : " "); + OS << (isDivergent(&Arg) ? "DIVERGENT: " : " "); OS << Arg << "\n"; } // Iterate instructions using instructions() to ensure a deterministic order. @@ -332,7 +381,7 @@ auto &BB = *BI; OS << "\n " << BB.getName() << ":\n"; for (auto &I : BB.instructionsWithoutDebug()) { - OS << (DivergentValues.count(&I) ? "DIVERGENT: " : " "); + OS << (isDivergent(&I) ? "DIVERGENT: " : " "); OS << I << "\n"; } } Index: lib/Analysis/SyncDependenceAnalysis.cpp =================================================================== --- /dev/null +++ lib/Analysis/SyncDependenceAnalysis.cpp @@ -0,0 +1,381 @@ +//===- SyncDependenceAnalysis.cpp - Divergent Branch Dependence Calculation +//--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements an algorithm that returns for a divergent branch +// the set of basic blocks whose phi nodes become divergent due to divergent +// control. These are the blocks that are reachable by two disjoint paths from +// the branch or loop exits that have a reaching path that is disjoint from a +// path to the loop latch. +// +// The SyncDependenceAnalysis is used in the DivergenceAnalysis to model +// control-induced divergence in phi nodes. +// +// -- Summary -- +// The SyncDependenceAnalysis lazily computes sync dependences [3]. +// The analysis evaluates the disjoint path criterion [2] by a reduction +// to SSA construction. The SSA construction algorithm is implemented as +// a simple data-flow analysis [1]. +// +// [1] "A Simple, Fast Dominance Algorithm", SPI '01, Cooper, Harvey and Kennedy +// [2] "Efficiently Computing Static Single Assignment Form +// and the Control Dependence Graph", TOPLAS '91, +// Cytron, Ferrante, Rosen, Wegman and Zadeck +// [3] "Improving Performance of OpenCL on CPUs", CC '12, Karrenberg and Hack +// [4] "Divergence Analysis", TOPLAS '13, Sampaio, Souza, Collange and Pereira +// +// -- Sync dependence -- +// Sync dependence [4] characterizes the control flow aspect of the +// propagation of branch divergence. For example, +// +// %cond = icmp slt i32 %tid, 10 +// br i1 %cond, label %then, label %else +// then: +// br label %merge +// else: +// br label %merge +// merge: +// %a = phi i32 [ 0, %then ], [ 1, %else ] +// +// Suppose %tid holds the thread ID. Although %a is not data dependent on %tid +// because %tid is not on its use-def chains, %a is sync dependent on %tid +// because the branch "br i1 %cond" depends on %tid and affects which value %a +// is assigned to. +// +// -- Reduction to SSA construction -- +// There are two disjoint paths from A to X, if a certain variant of SSA +// construction places a phi node in X under the following set-up scheme [2]. +// +// This variant of SSA construction ignores incoming undef values. +// That is paths from the entry without a definition do not result in +// phi nodes. +// +// entry +// / \ +// A \ +// / \ Y +// B C / +// \ / \ / +// D E +// \ / +// F +// Assume that A contains a divergent branch. We are interested +// in the set of all blocks where each block is reachable from A +// via two disjoint paths. This would be the set {D, F} in this +// case. +// To generally reduce this query to SSA construction we introduce +// a virtual variable x and assign to x different values in each +// successor block of A. +// entry +// / \ +// A \ +// / \ Y +// x = 0 x = 1 / +// \ / \ / +// D E +// \ / +// F +// Our flavor of SSA construction for x will construct the following +// entry +// / \ +// A \ +// / \ Y +// x0 = 0 x1 = 1 / +// \ / \ / +// x2=phi E +// \ / +// x3=phi +// The blocks D and F contain phi nodes and are thus each reachable +// by two disjoins paths from A. +// +// -- Remarks -- +// In case of loop exits we need to check the disjoint path criterion for loops +// [2]. To this end, we check whether the definition of x differs between the +// loop exit and the loop header (_after_ SSA construction). +// +//===----------------------------------------------------------------------===// +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/Analysis/PostDominators.h" +#include "llvm/Analysis/SyncDependenceAnalysis.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" + +#include +#include + +#define DEBUG_TYPE "sync-dependence" + +namespace llvm { + +ConstBlockSet SyncDependenceAnalysis::EmptyBlockSet; + +SyncDependenceAnalysis::SyncDependenceAnalysis(const DominatorTree &DT, + const PostDominatorTree &PDT, + const LoopInfo &LI) + : FuncRPOT(DT.getRoot()->getParent()), DT(DT), PDT(PDT), LI(LI) {} + +SyncDependenceAnalysis::~SyncDependenceAnalysis() {} + +using FunctionRPOT = ReversePostOrderTraversal; + +// divergence propagator for reducible CFGs +struct DivergencePropagator { + const FunctionRPOT &FuncRPOT; + const DominatorTree &DT; + const PostDominatorTree &PDT; + const LoopInfo &LI; + + // identified join points + std::unique_ptr JoinBlocks; + + // reached loop exits (by a path disjoint to a path to the loop header) + SmallPtrSet ReachedLoopExits; + + // if DefMap[B] == C then C is the dominating definition at block B + // if DefMap[B] ~ undef then we haven't seen B yet + // if DefMap[B] == B then B is a join point of disjoint paths from X or B is + // an immediate successor of X (initial value). + using DefiningBlockMap = std::map; + DefiningBlockMap DefMap; + + // all blocks with pending visits + std::unordered_set PendingUpdates; + + DivergencePropagator(const FunctionRPOT &FuncRPOT, const DominatorTree &DT, + const PostDominatorTree &PDT, const LoopInfo &LI) + : FuncRPOT(FuncRPOT), DT(DT), PDT(PDT), LI(LI), + JoinBlocks(new ConstBlockSet) {} + + // set the definition at @block and mark @block as pending for a visit + void addPending(const BasicBlock &Block, const BasicBlock &DefBlock) { + bool WasAdded = DefMap.emplace(&Block, &DefBlock).second; + if (WasAdded) + PendingUpdates.insert(&Block); + } + + void printDefs(raw_ostream &Out) { + Out << "Propagator::DefMap {\n"; + for (const auto *Block : FuncRPOT) { + auto It = DefMap.find(Block); + Out << Block->getName() << " : "; + if (It == DefMap.end()) { + Out << "\n"; + } else { + const auto *DefBlock = It->second; + Out << (DefBlock ? DefBlock->getName() : "") << "\n"; + } + } + Out << "}\n"; + } + + // process @succBlock with reaching definition @defBlock + // the original divergent branch was in @parentLoop (if any) + void visitSuccessor(const BasicBlock &SuccBlock, const Loop *ParentLoop, + const BasicBlock &DefBlock) { + + // @succBlock is a loop exit + if (ParentLoop && !ParentLoop->contains(&SuccBlock)) { + DefMap.emplace(&SuccBlock, &DefBlock); + ReachedLoopExits.insert(&SuccBlock); + return; + } + + // first reaching def? + auto ItLastDef = DefMap.find(&SuccBlock); + if (ItLastDef == DefMap.end()) { + addPending(SuccBlock, DefBlock); + return; + } + + // a join of at least two definitions + if (ItLastDef->second != &DefBlock) { + // do we know this join already? + if (!JoinBlocks->insert(&SuccBlock).second) + return; + + // update the definition + addPending(SuccBlock, SuccBlock); + } + } + + // find all blocks reachable by two disjoint paths from @rootTerm. + // This method works for both divergent TerminatorInsts and loops with + // divergent exits. + // @rootBlock is either the block containing the branch or the header of the + // divergent loop. + // @nodeSuccessors is the set of successors of the node (Loop or Terminator) + // headed by @rootBlock. + // @parentLoop is the parent loop of the Loop or the loop that contains the + // Terminator. + template + std::unique_ptr + computeJoinPoints(const BasicBlock &RootBlock, + SuccessorIterable NodeSuccessors, const Loop *ParentLoop) { + assert(JoinBlocks); + + // immediate post dominator (no join block beyond that block) + const auto *PdNode = PDT.getNode(const_cast(&RootBlock)); + const auto *IpdNode = PdNode->getIDom(); + const auto *PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr; + + // bootstrap with branch targets + for (const auto *SuccBlock : NodeSuccessors) { + DefMap.emplace(SuccBlock, SuccBlock); + + if (ParentLoop && !ParentLoop->contains(SuccBlock)) { + // immediate loop exit from node. + ReachedLoopExits.insert(SuccBlock); + continue; + } else { + // regular successor + PendingUpdates.insert(SuccBlock); + } + } + + auto ItBeginRPO = FuncRPOT.begin(); + + // skip until term (TODO RPOT won't let us start at @term directly) + for (; *ItBeginRPO != &RootBlock; ++ItBeginRPO) { + } + + auto ItEndRPO = FuncRPOT.end(); + assert(ItBeginRPO != ItEndRPO); + + // propagate definitions at the immediate successors of the node in RPO + auto ItBlockRPO = ItBeginRPO; + while (++ItBlockRPO != ItEndRPO && *ItBlockRPO != PdBoundBlock) { + const auto *Block = *ItBlockRPO; + + // skip @block if not pending update + auto ItPending = PendingUpdates.find(Block); + if (ItPending == PendingUpdates.end()) + continue; + PendingUpdates.erase(ItPending); + + // propagate definition at @block to its successors + auto ItDef = DefMap.find(Block); + const auto *DefBlock = ItDef->second; + assert(DefBlock); + + auto *BlockLoop = LI.getLoopFor(Block); + if (ParentLoop && + (ParentLoop != BlockLoop && ParentLoop->contains(BlockLoop))) { + // if the successor is the header of a nested loop pretend its a + // single node with the loop's exits as successors + SmallVector BlockLoopExits; + BlockLoop->getExitBlocks(BlockLoopExits); + for (const auto *BlockLoopExit : BlockLoopExits) { + visitSuccessor(*BlockLoopExit, ParentLoop, *DefBlock); + } + + } else { + // the successors are either on the same loop level or loop exits + for (const auto *SuccBlock : successors(Block)) { + visitSuccessor(*SuccBlock, ParentLoop, *DefBlock); + } + } + } + + // We need to know the definition at the parent loop header to decide + // whether the definition at the header is different from the definition at + // the loop exits, which would indicate a divergent loop exits. + // + // A // loop header + // | + // B // nested loop header + // | + // C -> X (exit from B loop) -..-> (A latch) + // | + // D -> back to B (B latch) + // | + // proper exit from both loops + // + // D post-dominates B as it is the only proper exit from the "A loop". + // If C has a divergent branch, propagation will therefore stop at D. + // That implies that B will never receive a definition. + // But that definition can only be the same as at D (D itself in thise case) + // because all paths to anywhere have to pass through D. + // + const BasicBlock *ParentLoopHeader = + ParentLoop ? ParentLoop->getHeader() : nullptr; + if (ParentLoop && ParentLoop->contains(PdBoundBlock)) { + DefMap[ParentLoopHeader] = DefMap[PdBoundBlock]; + } + + // analyze reached loop exits + if (!ReachedLoopExits.empty()) { + assert(ParentLoop); + const auto *HeaderDefBlock = DefMap[ParentLoopHeader]; + LLVM_DEBUG(printDefs(dbgs())); + assert(HeaderDefBlock && "no definition in header of carrying loop"); + + for (const auto *ExitBlock : ReachedLoopExits) { + auto ItExitDef = DefMap.find(ExitBlock); + assert((ItExitDef != DefMap.end()) && + "no reaching def at reachable loop exit"); + if (ItExitDef->second != HeaderDefBlock) { + JoinBlocks->insert(ExitBlock); + } + } + } + + return std::move(JoinBlocks); + } +}; + +const ConstBlockSet &SyncDependenceAnalysis::join_blocks(const Loop &Loop) { + using LoopExitVec = SmallVector; + LoopExitVec LoopExits; + Loop.getExitBlocks(LoopExits); + if (LoopExits.size() < 1) { + return EmptyBlockSet; + } + + // already available in cache? + auto ItCached = CachedLoopExitJoins.find(&Loop); + if (ItCached != CachedLoopExitJoins.end()) + return *ItCached->second; + + // compute all join points + DivergencePropagator Propagator{FuncRPOT, DT, PDT, LI}; + auto JoinBlocks = Propagator.computeJoinPoints( + *Loop.getHeader(), LoopExits, Loop.getParentLoop()); + + auto ItInserted = CachedLoopExitJoins.emplace(&Loop, std::move(JoinBlocks)); + assert(ItInserted.second); + return *ItInserted.first->second; +} + +const ConstBlockSet & +SyncDependenceAnalysis::join_blocks(const TerminatorInst &Term) { + // trivial case + if (Term.getNumSuccessors() < 1) { + return EmptyBlockSet; + } + + // already available in cache? + auto ItCached = CachedBranchJoins.find(&Term); + if (ItCached != CachedBranchJoins.end()) + return *ItCached->second; + + // compute all join points + DivergencePropagator Propagator{FuncRPOT, DT, PDT, LI}; + const auto &TermBlock = *Term.getParent(); + auto JoinBlocks = Propagator.computeJoinPoints( + TermBlock, successors(Term.getParent()), LI.getLoopFor(&TermBlock)); + + auto ItInserted = CachedBranchJoins.emplace(&Term, std::move(JoinBlocks)); + assert(ItInserted.second); + return *ItInserted.first->second; +} + +} // namespace llvm Index: test/Analysis/DivergenceAnalysis/AMDGPU/always_uniform.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/always_uniform.ll @@ -0,0 +1,14 @@ +; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +define amdgpu_kernel void @workitem_id_x() #1 { + %id.x = call i32 @llvm.amdgcn.workitem.id.x() +; CHECK: DIVERGENT: %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %first.lane = call i32 @llvm.amdgcn.readfirstlane(i32 %id.x) +; CHECK-NOT: DIVERGENT: %first.lane = call i32 @llvm.amdgcn.readfirstlane(i32 %id.x) + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 +declare i32 @llvm.amdgcn.readfirstlane(i32) #0 + +attributes #0 = { nounwind readnone } Index: test/Analysis/DivergenceAnalysis/AMDGPU/atomics.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/atomics.ll @@ -0,0 +1,45 @@ +; RUN: opt -mtriple=amdgcn-- -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +; CHECK: DIVERGENT: %orig = atomicrmw xchg i32* %ptr, i32 %val seq_cst +define i32 @test1(i32* %ptr, i32 %val) #0 { + %orig = atomicrmw xchg i32* %ptr, i32 %val seq_cst + ret i32 %orig +} + +; CHECK: DIVERGENT: %orig = cmpxchg i32* %ptr, i32 %cmp, i32 %new seq_cst seq_cst +define {i32, i1} @test2(i32* %ptr, i32 %cmp, i32 %new) { + %orig = cmpxchg i32* %ptr, i32 %cmp, i32 %new seq_cst seq_cst + ret {i32, i1} %orig +} + +; CHECK: DIVERGENT: %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val, i32 0, i32 0, i1 false) +define i32 @test_atomic_inc_i32(i32 addrspace(1)* %ptr, i32 %val) #0 { + %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val, i32 0, i32 0, i1 false) + ret i32 %ret +} + +; CHECK: DIVERGENT: %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val, i32 0, i32 0, i1 false) +define i64 @test_atomic_inc_i64(i64 addrspace(1)* %ptr, i64 %val) #0 { + %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val, i32 0, i32 0, i1 false) + ret i64 %ret +} + +; CHECK: DIVERGENT: %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val, i32 0, i32 0, i1 false) +define i32 @test_atomic_dec_i32(i32 addrspace(1)* %ptr, i32 %val) #0 { + %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val, i32 0, i32 0, i1 false) + ret i32 %ret +} + +; CHECK: DIVERGENT: %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val, i32 0, i32 0, i1 false) +define i64 @test_atomic_dec_i64(i64 addrspace(1)* %ptr, i64 %val) #0 { + %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val, i32 0, i32 0, i1 false) + ret i64 %ret +} + +declare i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* nocapture, i32, i32, i32, i1) #1 +declare i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* nocapture, i64, i32, i32, i1) #1 +declare i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* nocapture, i32, i32, i32, i1) #1 +declare i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* nocapture, i64, i32, i32, i1) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind argmemonly } Index: test/Analysis/DivergenceAnalysis/AMDGPU/hidden_diverge.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/hidden_diverge.ll @@ -0,0 +1,26 @@ +; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +define amdgpu_kernel void @hidden_diverge(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'hidden_diverge' +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %cond.var = icmp slt i32 %tid, 0 + br i1 %cond.var, label %B, label %C ; divergent +; CHECK: DIVERGENT: br i1 %cond.var, +B: + %cond.uni = icmp slt i32 %n, 0 + br i1 %cond.uni, label %C, label %merge ; uniform +; CHECK-NOT: DIVERGENT: br i1 %cond.uni, +C: + %phi.var.hidden = phi i32 [ 1, %entry ], [ 2, %B ] +; CHECK: DIVERGENT: %phi.var.hidden = phi i32 + br label %merge +merge: + %phi.ipd = phi i32 [ %a, %B ], [ %b, %C ] +; CHECK: DIVERGENT: %phi.ipd = phi i32 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 + +attributes #0 = { nounwind readnone } Index: test/Analysis/DivergenceAnalysis/AMDGPU/hidden_loopdiverge.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/hidden_loopdiverge.ll @@ -0,0 +1,223 @@ +; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +; divergent loop (H
, B) +; the divergent join point in %exit is obscured by uniform control joining in %X +define amdgpu_kernel void @hidden_loop_diverge(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'hidden_loop_diverge': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + br i1 %uni.cond, label %X, label %H ; uniform + +H: + %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %B ] + %div.exitx = icmp slt i32 %tid, 0 + br i1 %div.exitx, label %X, label %B ; divergent branch +; CHECK: DIVERGENT: %div.exitx = +; CHECK: DIVERGENT: br i1 %div.exitx, + +B: + %uni.inc = add i32 %uni.merge.h, 1 + %div.exity = icmp sgt i32 %tid, 0 + br i1 %div.exity, label %Y, label %H ; divergent branch +; CHECK: DIVERGENT: %div.exity = +; CHECK: DIVERGENT: br i1 %div.exity, + +X: + %div.merge.x = phi i32 [ %a, %entry ], [ %uni.merge.h, %H ] ; temporal divergent phi + br i1 %uni.cond, label %Y, label %exit +; CHECK: DIVERGENT: %div.merge.x = + +Y: + %div.merge.y = phi i32 [ 42, %X ], [ %b, %B ] + br label %exit +; CHECK: DIVERGENT: %div.merge.y = + +exit: + %div.merge.exit = phi i32 [ %a, %X ], [ %b, %Y ] + ret void +; CHECK: DIVERGENT: %div.merge.exit = +} + +; divergent loop (H
, B) +; the phi nodes in X and Y don't actually receive divergent values +define amdgpu_kernel void @unobserved_loop_diverge(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'unobserved_loop_diverge': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + br i1 %uni.cond, label %X, label %H ; uniform + +H: + %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %B ] + %div.exitx = icmp slt i32 %tid, 0 + br i1 %div.exitx, label %X, label %B ; divergent branch +; CHECK: DIVERGENT: %div.exitx = +; CHECK: DIVERGENT: br i1 %div.exitx, + +B: + %uni.inc = add i32 %uni.merge.h, 1 + %div.exity = icmp sgt i32 %tid, 0 + br i1 %div.exity, label %Y, label %H ; divergent branch +; CHECK: DIVERGENT: %div.exity = +; CHECK: DIVERGENT: br i1 %div.exity, + +X: + %uni.merge.x = phi i32 [ %a, %entry ], [ %b, %H ] + br label %exit + +Y: + %uni.merge.y = phi i32 [ %b, %B ] + br label %exit + +exit: + %div.merge.exit = phi i32 [ %a, %X ], [ %b, %Y ] + ret void +; CHECK: DIVERGENT: %div.merge.exit = +} + +; divergent loop (G
, L) inside divergent loop (H
, B, C, D, G, L) +; the inner loop has no exit to top level. +; the outer loop becomes divergent as its exiting branch in C is control-dependent on the inner loop's divergent loop exit in D. +define amdgpu_kernel void @hidden_nestedloop_diverge(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'hidden_nestedloop_diverge': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + %div.exitx = icmp slt i32 %tid, 0 + br i1 %uni.cond, label %X, label %H + +H: + %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %D ] + br i1 %uni.cond, label %G, label %B +; CHECK: DIVERGENT: %div.exitx = + +B: + br i1 %uni.cond, label %X, label %C + +C: + br i1 %uni.cond, label %Y, label %D + +D: + %uni.inc = add i32 %uni.merge.h, 1 + br label %H + +G: + br i1 %div.exitx, label %C, label %L +; CHECK: DIVERGENT: br i1 %div.exitx, + +L: + br i1 %uni.cond, label %D, label %G + +X: + %div.merge.x = phi i32 [ %a, %entry ], [ %uni.merge.h, %B ] ; temporal divergent phi + br i1 %uni.cond, label %Y, label %exit +; CHECK: DIVERGENT: %div.merge.x = + +Y: + %div.merge.y = phi i32 [ 42, %X ], [ %b, %C ] + br label %exit +; CHECK: DIVERGENT: %div.merge.y = + +exit: + %div.merge.exit = phi i32 [ %a, %X ], [ %b, %Y ] + ret void +; CHECK: DIVERGENT: %div.merge.exit = +} + +; divergent loop (G
, L) in divergent loop (H
, B, C, G, L) +; the outer loop has no immediately divergent exiting edge. +; the inner exiting edge is exiting to top-level through the outer loop causing both to become divergent. +define amdgpu_kernel void @hidden_doublebreak_diverge(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'hidden_doublebreak_diverge': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + %div.exitx = icmp slt i32 %tid, 0 + br i1 %uni.cond, label %X, label %H + +H: + %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %C ] + br i1 %uni.cond, label %G, label %B +; CHECK: DIVERGENT: %div.exitx = + +B: + br i1 %uni.cond, label %Y, label %C + +C: + %uni.inc = add i32 %uni.merge.h, 1 + br label %H + +G: + br i1 %div.exitx, label %X, label %L ; two-level break +; CHECK: DIVERGENT: br i1 %div.exitx, + +L: + br i1 %uni.cond, label %C, label %G + +X: + %div.merge.x = phi i32 [ %a, %entry ], [ %uni.merge.h, %G ] ; temporal divergence + br label %Y +; CHECK: DIVERGENT: %div.merge.x = + +Y: + %div.merge.y = phi i32 [ 42, %X ], [ %b, %B ] + ret void +; CHECK: DIVERGENT: %div.merge.y = +} + +; divergent loop (G
, L) contained inside a uniform loop (H
, B, G, L , D) +define amdgpu_kernel void @hidden_containedloop_diverge(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'hidden_containedloop_diverge': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + %div.exitx = icmp slt i32 %tid, 0 + br i1 %uni.cond, label %X, label %H + +H: + %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc.d, %D ] + br i1 %uni.cond, label %G, label %B +; CHECK: DIVERGENT: %div.exitx = + +B: + %div.merge.b = phi i32 [ 42, %H ], [ %uni.merge.g, %G ] + br label %D +; CHECK: DIVERGENT: %div.merge.b = + +G: + %uni.merge.g = phi i32 [ 123, %H ], [ %uni.inc.l, %L ] + br i1 %div.exitx, label %B, label %L +; CHECK: DIVERGENT: br i1 %div.exitx, + +L: + %uni.inc.l = add i32 %uni.merge.g, 1 + br i1 %uni.cond, label %G, label %D + +D: + %uni.inc.d = add i32 %uni.merge.h, 1 + br i1 %uni.cond, label %X, label %H + +X: + %uni.merge.x = phi i32 [ %a, %entry ], [ %uni.inc.d, %D ] + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 + +attributes #0 = { nounwind readnone } Index: test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll @@ -0,0 +1,13 @@ +; RUN: opt -mtriple=amdgcn-- -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +; CHECK: DIVERGENT: %swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0 +define amdgpu_kernel void @ds_swizzle(i32 addrspace(1)* %out, i32 %src) #0 { + %swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0 + store i32 %swizzle, i32 addrspace(1)* %out, align 4 + ret void +} + +declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1 + +attributes #0 = { nounwind convergent } +attributes #1 = { nounwind readnone convergent } Index: test/Analysis/DivergenceAnalysis/AMDGPU/irreducible.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/irreducible.ll @@ -0,0 +1,48 @@ +; RUN: opt -mtriple=amdgcn-- -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +; This test contains an unstructured loop. +; +-------------- entry ----------------+ +; | | +; V V +; i1 = phi(0, i3) i2 = phi(0, i3) +; j1 = i1 + 1 ---> i3 = phi(j1, j2) <--- j2 = i2 + 2 +; ^ | ^ +; | V | +; +-------- switch (tid / i3) ----------+ +; | +; V +; if (i3 == 5) // divergent +; because sync dependent on (tid / i3). +define i32 @unstructured_loop(i1 %entry_cond) { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'unstructured_loop' +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + br i1 %entry_cond, label %loop_entry_1, label %loop_entry_2 +loop_entry_1: + %i1 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ] + %j1 = add i32 %i1, 1 + br label %loop_body +loop_entry_2: + %i2 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ] + %j2 = add i32 %i2, 2 + br label %loop_body +loop_body: + %i3 = phi i32 [ %j1, %loop_entry_1 ], [ %j2, %loop_entry_2 ] + br label %loop_latch +loop_latch: + %div = sdiv i32 %tid, %i3 + switch i32 %div, label %branch [ i32 1, label %loop_entry_1 + i32 2, label %loop_entry_2 ] +branch: + %cmp = icmp eq i32 %i3, 5 + br i1 %cmp, label %then, label %else +; CHECK: DIVERGENT: br i1 %cmp, +then: + ret i32 0 +else: + ret i32 1 +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 + +attributes #0 = { nounwind readnone } Index: test/Analysis/DivergenceAnalysis/AMDGPU/kernel-args.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/kernel-args.ll @@ -0,0 +1,41 @@ +; RUN: opt %s -mtriple amdgcn-- -analyze -divergence -use-gpu-divergence-analysis | FileCheck %s + +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'test_amdgpu_ps': +; CHECK: DIVERGENT: +; CHECK-NOT: %arg0 +; CHECK-NOT: %arg1 +; CHECK-NOT: %arg2 +; CHECK: <2 x i32> %arg3 +; CHECK: DIVERGENT: <3 x i32> %arg4 +; CHECK: DIVERGENT: float %arg5 +; CHECK: DIVERGENT: i32 %arg6 + +define amdgpu_ps void @test_amdgpu_ps([4 x <16 x i8>] addrspace(2)* byval %arg0, float inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <3 x i32> %arg4, float %arg5, i32 %arg6) #0 { + ret void +} + +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'test_amdgpu_kernel': +; CHECK-NOT: %arg0 +; CHECK-NOT: %arg1 +; CHECK-NOT: %arg2 +; CHECK-NOT: %arg3 +; CHECK-NOT: %arg4 +; CHECK-NOT: %arg5 +; CHECK-NOT: %arg6 +define amdgpu_kernel void @test_amdgpu_kernel([4 x <16 x i8>] addrspace(2)* byval %arg0, float inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <3 x i32> %arg4, float %arg5, i32 %arg6) #0 { + ret void +} + +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'test_c': +; CHECK: DIVERGENT: +; CHECK: DIVERGENT: +; CHECK: DIVERGENT: +; CHECK: DIVERGENT: +; CHECK: DIVERGENT: +; CHECK: DIVERGENT: +; CHECK: DIVERGENT: +define void @test_c([4 x <16 x i8>] addrspace(2)* byval %arg0, float inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <3 x i32> %arg4, float %arg5, i32 %arg6) #0 { + ret void +} + +attributes #0 = { nounwind } Index: test/Analysis/DivergenceAnalysis/AMDGPU/lit.local.cfg =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'AMDGPU' in config.root.targets: + config.unsupported = True Index: test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll @@ -0,0 +1,103 @@ +;RUN: opt -mtriple=amdgcn-mesa-mesa3d -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.swap( +define float @buffer_atomic_swap(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.swap(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.add( +define float @buffer_atomic_add(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.add(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.sub( +define float @buffer_atomic_sub(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.sub(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.smin( +define float @buffer_atomic_smin(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.smin(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.umin( +define float @buffer_atomic_umin(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.umin(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.smax( +define float @buffer_atomic_smax(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.smax(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.umax( +define float @buffer_atomic_umax(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.umax(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.and( +define float @buffer_atomic_and(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.and(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.or( +define float @buffer_atomic_or(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.or(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.xor( +define float @buffer_atomic_xor(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.xor(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.cmpswap( +define float @buffer_atomic_cmpswap(<4 x i32> inreg %rsrc, i32 inreg %data, i32 inreg %cmp) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %data, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +declare i32 @llvm.amdgcn.buffer.atomic.swap(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.add(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.smin(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.umin(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.smax(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.umax(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.and(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.or(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.xor(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32, i32, <4 x i32>, i32, i32, i1) #0 + +attributes #0 = { nounwind } Index: test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.image.atomic.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.image.atomic.ll @@ -0,0 +1,131 @@ +;RUN: opt -mtriple=amdgcn-mesa-mesa3d -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32( +define float @image_atomic_swap(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32( +define float @image_atomic_add(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i32( +define float @image_atomic_sub(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i32( +define float @image_atomic_smin(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i32( +define float @image_atomic_umin(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i32( +define float @image_atomic_smax(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i32( +define float @image_atomic_umax(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.and.1d.i32.i32( +define float @image_atomic_and(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.and.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.or.1d.i32.i32( +define float @image_atomic_or(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.or.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i32( +define float @image_atomic_xor(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i32( +define float @image_atomic_inc(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i32( +define float @image_atomic_dec(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32( +define float @image_atomic_cmpswap(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data, i32 inreg %cmp) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32 %data, i32 %cmp, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32( +define float @image_atomic_add_2d(<8 x i32> inreg %rsrc, i32 inreg %s, i32 inreg %t, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32 %data, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +declare i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.and.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.or.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #0 + +declare i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #0 + +attributes #0 = { nounwind } Index: test/Analysis/DivergenceAnalysis/AMDGPU/loads.ll =================================================================== --- test/Analysis/DivergenceAnalysis/AMDGPU/loads.ll +++ test/Analysis/DivergenceAnalysis/AMDGPU/loads.ll @@ -1,4 +1,4 @@ -; RUN: opt -mtriple=amdgcn-- -analyze -divergence %s | FileCheck %s +; RUN: opt -mtriple=amdgcn-- -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s ; Test that we consider loads from flat and private addrspaces to be divergent. Index: test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll @@ -0,0 +1,30 @@ +; RUN: opt %s -mtriple amdgcn-- -analyze -divergence -use-gpu-divergence-analysis | FileCheck %s + +; CHECK: DIVERGENT: %tmp5 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %tmp2 +; CHECK: DIVERGENT: %tmp10 = load volatile float, float addrspace(1)* %tmp5, align 4 +; CHECK: DIVERGENT: %tmp11 = load volatile float, float addrspace(1)* %tmp5, align 4 + +; The post dominator tree does not have a root node in this case +define amdgpu_kernel void @no_return_blocks(float addrspace(1)* noalias nocapture readonly %arg, float addrspace(1)* noalias nocapture readonly %arg1) #0 { +bb0: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0 + %tmp2 = sext i32 %tmp to i64 + %tmp5 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %tmp2 + %tmp6 = load volatile float, float addrspace(1)* %tmp5, align 4 + %tmp8 = fcmp olt float %tmp6, 0.000000e+00 + br i1 %tmp8, label %bb1, label %bb2 + +bb1: + %tmp10 = load volatile float, float addrspace(1)* %tmp5, align 4 + br label %bb2 + +bb2: + %tmp11 = load volatile float, float addrspace(1)* %tmp5, align 4 + br label %bb1 +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } Index: test/Analysis/DivergenceAnalysis/AMDGPU/phi-undef.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/phi-undef.ll @@ -0,0 +1,31 @@ +; RUN: opt -mtriple=amdgcn-- -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +; CHECK-LABEL: 'test1': +; CHECK-NEXT: DIVERGENT: i32 %bound +; CHECK: {{^ *}}%counter = +; CHECK-NEXT: DIVERGENT: %break = icmp sge i32 %counter, %bound +; CHECK-NEXT: DIVERGENT: br i1 %break, label %footer, label %body +; CHECK: {{^ *}}%counter.next = +; CHECK: {{^ *}}%counter.footer = +; CHECK: DIVERGENT: br i1 %break, label %end, label %header +; Note: %counter is not divergent! +define amdgpu_ps void @test1(i32 %bound) { +entry: + br label %header + +header: + %counter = phi i32 [ 0, %entry ], [ %counter.footer, %footer ] + %break = icmp sge i32 %counter, %bound + br i1 %break, label %footer, label %body + +body: + %counter.next = add i32 %counter, 1 + br label %footer + +footer: + %counter.footer = phi i32 [ %counter.next, %body ], [ undef, %header ] + br i1 %break, label %end, label %header + +end: + ret void +} Index: test/Analysis/DivergenceAnalysis/AMDGPU/temporal_diverge.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/temporal_diverge.ll @@ -0,0 +1,154 @@ +; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +; temporal-divergent use of value carried by divergent loop +define amdgpu_kernel void @temporal_diverge(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'temporal_diverge': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + br label %H + +H: + %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %H ] + %uni.inc = add i32 %uni.merge.h, 1 + %div.exitx = icmp slt i32 %tid, 0 + br i1 %div.exitx, label %X, label %H ; divergent branch +; CHECK: DIVERGENT: %div.exitx = +; CHECK: DIVERGENT: br i1 %div.exitx, + +X: + %div.user = add i32 %uni.inc, 5 + ret void +} + +; temporal-divergent use of value carried by divergent loop inside a top-level loop +define amdgpu_kernel void @temporal_diverge_inloop(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'temporal_diverge_inloop': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + br label %G + +G: + br label %H + +H: + %uni.merge.h = phi i32 [ 0, %G ], [ %uni.inc, %H ] + %uni.inc = add i32 %uni.merge.h, 1 + %div.exitx = icmp slt i32 %tid, 0 + br i1 %div.exitx, label %X, label %H ; divergent branch +; CHECK: DIVERGENT: %div.exitx = +; CHECK: DIVERGENT: br i1 %div.exitx, + +X: + %div.user = add i32 %uni.inc, 5 + br i1 %uni.cond, label %G, label %Y + +Y: + %div.alsouser = add i32 %uni.inc, 5 + ret void +} + + +; temporal-uniform use of a valud, definition and users are carried by a surrounding divergent loop +define amdgpu_kernel void @temporal_uniform_indivloop(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'temporal_uniform_indivloop': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + br label %G + +G: + br label %H + +H: + %uni.merge.h = phi i32 [ 0, %G ], [ %uni.inc, %H ] + %uni.inc = add i32 %uni.merge.h, 1 + br i1 %uni.cond, label %X, label %H ; divergent branch + +X: + %uni.user = add i32 %uni.inc, 5 + %div.exity = icmp slt i32 %tid, 0 +; CHECK: DIVERGENT: %div.exity = + br i1 %div.exity, label %G, label %Y +; CHECK: DIVERGENT: br i1 %div.exity, + +Y: + %div.alsouser = add i32 %uni.inc, 5 + ret void +} + + +; temporal-divergent use of value carried by divergent loop, user is inside sibling loop +define amdgpu_kernel void @temporal_diverge_loopuser(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'temporal_diverge_loopuser': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + br label %H + +H: + %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %H ] + %uni.inc = add i32 %uni.merge.h, 1 + %div.exitx = icmp slt i32 %tid, 0 + br i1 %div.exitx, label %X, label %H ; divergent branch +; CHECK: DIVERGENT: %div.exitx = +; CHECK: DIVERGENT: br i1 %div.exitx, + +X: + br label %G + +G: + %div.user = add i32 %uni.inc, 5 + br i1 %uni.cond, label %G, label %Y + +Y: + ret void +} + +; temporal-divergent use of value carried by divergent loop, user is inside sibling loop, defs and use are carried by a uniform loop +define amdgpu_kernel void @temporal_diverge_loopuser_nested(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'temporal_diverge_loopuser_nested': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + br label %H + +H: + %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %H ] + %uni.inc = add i32 %uni.merge.h, 1 + %div.exitx = icmp slt i32 %tid, 0 + br i1 %div.exitx, label %X, label %H ; divergent branch +; CHECK: DIVERGENT: %div.exitx = +; CHECK: DIVERGENT: br i1 %div.exitx, + +X: + br label %G + +G: + %div.user = add i32 %uni.inc, 5 + br i1 %uni.cond, label %G, label %Y + +Y: + ret void +} + + +declare i32 @llvm.amdgcn.workitem.id.x() #0 + +attributes #0 = { nounwind readnone } Index: test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll @@ -0,0 +1,45 @@ +; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +declare i32 @llvm.amdgcn.workitem.id.x() #0 +declare i32 @llvm.amdgcn.workitem.id.y() #0 +declare i32 @llvm.amdgcn.workitem.id.z() #0 +declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0 +declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0 + +; CHECK: DIVERGENT: %id.x = call i32 @llvm.amdgcn.workitem.id.x() +define amdgpu_kernel void @workitem_id_x() #1 { + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + store volatile i32 %id.x, i32 addrspace(1)* undef + ret void +} + +; CHECK: DIVERGENT: %id.y = call i32 @llvm.amdgcn.workitem.id.y() +define amdgpu_kernel void @workitem_id_y() #1 { + %id.y = call i32 @llvm.amdgcn.workitem.id.y() + store volatile i32 %id.y, i32 addrspace(1)* undef + ret void +} + +; CHECK: DIVERGENT: %id.z = call i32 @llvm.amdgcn.workitem.id.z() +define amdgpu_kernel void @workitem_id_z() #1 { + %id.z = call i32 @llvm.amdgcn.workitem.id.z() + store volatile i32 %id.z, i32 addrspace(1)* undef + ret void +} + +; CHECK: DIVERGENT: %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 0, i32 0) +define amdgpu_kernel void @mbcnt_lo() #1 { + %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 0, i32 0) + store volatile i32 %mbcnt.lo, i32 addrspace(1)* undef + ret void +} + +; CHECK: DIVERGENT: %mbcnt.hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 0, i32 0) +define amdgpu_kernel void @mbcnt_hi() #1 { + %mbcnt.hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 0, i32 0) + store volatile i32 %mbcnt.hi, i32 addrspace(1)* undef + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } Index: test/Analysis/DivergenceAnalysis/Loops/IndirectUniAccess.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/Loops/IndirectUniAccess.ll @@ -0,0 +1,74 @@ +; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s + +; CHECK: Divergence of loop for.body { +; CHECK-NEXT: DIVERGENT: %indvars.iv29 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next30, %for.cond.cleanup3 ] +; CHECK-NEXT: DIVERGENT: %x.0.lcssa = phi double [ 0.000000e+00, %for.body ], [ %add, %for.body4 ] +; CHECK-NEXT: DIVERGENT: %arrayidx10 = getelementptr inbounds double, double* %C, i64 %indvars.iv29 +; CHECK-NEXT: DIVERGENT: store double %x.0.lcssa, double* %arrayidx10, align 8 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next30 = add nuw nsw i64 %indvars.iv29, 1 +; CHECK-NEXT: DIVERGENT: %x.025 = phi double [ %add, %for.body4 ], [ 0.000000e+00, %for.body4.preheader ] +; CHECK-NEXT: DIVERGENT: %arrayidx8 = getelementptr inbounds double, double* %1, i64 %indvars.iv29 +; CHECK-NEXT: DIVERGENT: %2 = load double, double* %arrayidx8, align 8 +; CHECK-NEXT: DIVERGENT: %add = fadd double %x.025, %2 +; CHECK-NEXT: } +; CHECK: Divergence of loop for.body4 { +; CHECK-NEXT: DIVERGENT: %indvars.iv = phi i64 [ %indvars.iv.next, %for.body4 ], [ 0, %for.body4.preheader ] +; CHECK-NEXT: DIVERGENT: %x.025 = phi double [ %add, %for.body4 ], [ 0.000000e+00, %for.body4.preheader ] +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds i32, i32* %Index, i64 %indvars.iv +; CHECK-NEXT: DIVERGENT: %0 = load i32, i32* %arrayidx, align 4 +; CHECK-NEXT: DIVERGENT: %idxprom5 = sext i32 %0 to i64 +; CHECK-NEXT: DIVERGENT: %arrayidx6 = getelementptr inbounds double*, double** %A, i64 %idxprom5 +; CHECK-NEXT: DIVERGENT: %1 = load double*, double** %arrayidx6, align 8 +; CHECK-NEXT: DIVERGENT: %arrayidx8 = getelementptr inbounds double, double* %1, i64 %indvars.iv29 +; CHECK-NEXT: DIVERGENT: %2 = load double, double* %arrayidx8, align 8 +; CHECK-NEXT: DIVERGENT: %add = fadd double %x.025, %2 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK-NEXT: } + +; Function Attrs: norecurse nounwind uwtable +define void @test(i32* nocapture readonly %Index, double** nocapture readonly %A, double* nocapture %C, i32 %m, i32 %n) #0 { +entry: + %cmp27 = icmp sgt i32 %n, 0 + br i1 %cmp27, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %cmp224 = icmp sgt i32 %m, 0 + %wide.trip.count = zext i32 %m to i64 + %wide.trip.count31 = zext i32 %n to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.cond.cleanup3, %entry + ret void + +for.body: ; preds = %for.cond.cleanup3, %for.body.lr.ph + %indvars.iv29 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next30, %for.cond.cleanup3 ] + br i1 %cmp224, label %for.body4.preheader, label %for.cond.cleanup3 + +for.body4.preheader: ; preds = %for.body + br label %for.body4 + +for.cond.cleanup3: ; preds = %for.body4, %for.body + %x.0.lcssa = phi double [ 0.000000e+00, %for.body ], [ %add, %for.body4 ] + %arrayidx10 = getelementptr inbounds double, double* %C, i64 %indvars.iv29 + store double %x.0.lcssa, double* %arrayidx10, align 8 + %indvars.iv.next30 = add nuw nsw i64 %indvars.iv29, 1 + %exitcond32 = icmp eq i64 %indvars.iv.next30, %wide.trip.count31 + br i1 %exitcond32, label %for.cond.cleanup, label %for.body + +for.body4: ; preds = %for.body4.preheader, %for.body4 + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body4 ], [ 0, %for.body4.preheader ] + %x.025 = phi double [ %add, %for.body4 ], [ 0.000000e+00, %for.body4.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %Index, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %idxprom5 = sext i32 %0 to i64 + %arrayidx6 = getelementptr inbounds double*, double** %A, i64 %idxprom5 + %1 = load double*, double** %arrayidx6, align 8 + %arrayidx8 = getelementptr inbounds double, double* %1, i64 %indvars.iv29 + %2 = load double, double* %arrayidx8, align 8 + %add = fadd double %x.025, %2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup3, label %for.body4 +} + +attributes #0 = { norecurse nounwind } Index: test/Analysis/DivergenceAnalysis/Loops/LoopWithDivBranch.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/Loops/LoopWithDivBranch.ll @@ -0,0 +1,44 @@ +; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s + +; CHECK: Divergence of loop for.body { +; CHECK-NEXT: DIVERGENT: %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %B ] +; CHECK-NEXT: DIVERGENT: %hfreq = srem i64 %indvars.iv, 2 +; CHECK-NEXT: DIVERGENT: %toggle = trunc i64 %hfreq to i1 +; CHECK-NEXT: DIVERGENT: br i1 %toggle, label %A, label %B +; CHECK-NEXT: DIVERGENT: %divphi = phi float [ %cast, %A ], [ 4.200000e+01, %for.body ] +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds float, float* %ptr, i64 %indvars.iv +; CHECK-NEXT: DIVERGENT: store float %divphi, float* %arrayidx, align 4 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK-NEXT: } +define void @test1(float* nocapture %ptr, i64 %n) #0 { + entry: + %cmp = icmp sgt i64 %n, 0 + br i1 %cmp, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body, %for.body.lr.ph + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %B ] + %hfreq = srem i64 %indvars.iv, 2 + %toggle = trunc i64 %hfreq to i1 + br i1 %toggle, label %A, label %B + +A: + %trunc = trunc i64 %n to i32 + %cast = sitofp i32 %trunc to float + br label %B + +B: + %divphi = phi float [ %cast, %A ], [ 4.200000e+01, %for.body ] + %arrayidx = getelementptr inbounds float, float* %ptr, i64 %indvars.iv + store float %divphi, float* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +attributes #0 = { nounwind } Index: test/Analysis/DivergenceAnalysis/Loops/LoopWithDivLoop.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/Loops/LoopWithDivLoop.ll @@ -0,0 +1,60 @@ +; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s + +; CHECK: Printing analysis 'Loop Divergence Printer' for function 'test1': +; CHECK-NEXT: Divergence of loop for.body { +; CHECK-NEXT: DIVERGENT: %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.latch ] +; CHECK-NEXT: DIVERGENT: %row = mul i64 %n, %indvars.iv +; CHECK-NEXT: DIVERGENT: %idx = add i64 %row, %indvars.iv2 +; CHECK-NEXT: DIVERGENT: %trunc = trunc i64 %idx to i32 +; CHECK-NEXT: DIVERGENT: %val = sitofp i32 %trunc to float +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds float, float* %ptr, i64 %idx +; CHECK-NEXT: DIVERGENT: store float %val, float* %arrayidx, align 4 +; CHECK-NEXT: DIVERGENT: %exitcond2 = icmp sge i64 %indvars.iv.next2, %indvars.iv +; CHECK-NEXT: DIVERGENT: br i1 %exitcond2, label %for.latch, label %for.body2 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK-NEXT: } +; CHECK: Divergence of loop for.body2 { +; CHECK-NEXT: DIVERGENT: %indvars.iv2 = phi i64 [ 0, %for.body ], [ %indvars.iv.next2, %for.body2 ] +; CHECK-NEXT: DIVERGENT: %idx = add i64 %row, %indvars.iv2 +; CHECK-NEXT: DIVERGENT: %trunc = trunc i64 %idx to i32 +; CHECK-NEXT: DIVERGENT: %val = sitofp i32 %trunc to float +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds float, float* %ptr, i64 %idx +; CHECK-NEXT: DIVERGENT: store float %val, float* %arrayidx, align 4 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next2 = add nuw nsw i64 %indvars.iv2, 1 +; CHECK-NEXT: } + +define void @test1(float* nocapture %ptr, i64 %n) #0 { + entry: + %cmp = icmp sgt i64 %n, 0 + br i1 %cmp, label %for.body.lr.ph, label %exit + +for.body.lr.ph: ; preds = %entry + br label %for.body + +exit: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.latch ] + br label %for.body2 + +for.body2: + %indvars.iv2 = phi i64 [ 0, %for.body ], [ %indvars.iv.next2, %for.body2 ] + %row = mul i64 %n, %indvars.iv + %idx = add i64 %row, %indvars.iv2 + %trunc = trunc i64 %idx to i32 + %val = sitofp i32 %trunc to float + %arrayidx = getelementptr inbounds float, float* %ptr, i64 %idx + store float %val, float* %arrayidx, align 4 + %indvars.iv.next2 = add nuw nsw i64 %indvars.iv2, 1 + %exitcond2 = icmp sge i64 %indvars.iv.next2, %indvars.iv + br i1 %exitcond2, label %for.latch, label %for.body2 + +for.latch: + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond, label %exit, label %for.body +} + +attributes #0 = { nounwind } + Index: test/Analysis/DivergenceAnalysis/Loops/LoopWithLI.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/Loops/LoopWithLI.ll @@ -0,0 +1,31 @@ +; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s + +; CHECK: Divergence of loop for.body { +; CHECK-NEXT: DIVERGENT: %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv +; CHECK-NEXT: DIVERGENT: store float %cast, float* %arrayidx, align 4 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK-NEXT: } +define void @test1(float* nocapture %A, i64 %n) #0 { + entry: + %cmp = icmp sgt i64 %n, 0 + br i1 %cmp, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body, %for.body.lr.ph + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv + %trunc = trunc i64 %n to i32 + %cast = sitofp i32 %trunc to float + store float %cast, float* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +attributes #0 = { nounwind } Index: test/Analysis/DivergenceAnalysis/Loops/LoopWithUniBranch.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/Loops/LoopWithUniBranch.ll @@ -0,0 +1,39 @@ +; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s + +; CHECK: Divergence of loop for.body { +; CHECK-NEXT: DIVERGENT: %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %B ] +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds float, float* %ptr, i64 %indvars.iv +; CHECK-NEXT: DIVERGENT: store float %divphi, float* %arrayidx, align 4 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK-NEXT: } +define void @test1(float* nocapture %ptr, i64 %n) #0 { + entry: + %cmp = icmp sgt i64 %n, 0 + br i1 %cmp, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body, %for.body.lr.ph + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %B ] + %invar = trunc i64 %n to i1 + br i1 %invar, label %A, label %B + +A: + %trunc = trunc i64 %n to i32 + %cast = sitofp i32 %trunc to float + br label %B + +B: + %divphi = phi float [ %cast, %A ], [ 4.200000e+01, %for.body ] + %arrayidx = getelementptr inbounds float, float* %ptr, i64 %indvars.iv + store float %divphi, float* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +attributes #0 = { nounwind } Index: test/Analysis/DivergenceAnalysis/Loops/LoopWithUniLoop.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/Loops/LoopWithUniLoop.ll @@ -0,0 +1,55 @@ +; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s + +; CHECK: Divergence of loop for.body { +; CHECK-NEXT: DIVERGENT: %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.latch ] +; CHECK-NEXT: DIVERGENT: %row = mul i64 %n, %indvars.iv +; CHECK-NEXT: DIVERGENT: %idx = add i64 %row, %indvars.iv2 +; CHECK-NEXT: DIVERGENT: %trunc = trunc i64 %idx to i32 +; CHECK-NEXT: DIVERGENT: %val = sitofp i32 %trunc to float +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds float, float* %ptr, i64 %idx +; CHECK-NEXT: DIVERGENT: store float %val, float* %arrayidx, align 4 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK-NEXT: } +; CHECK: Divergence of loop for.body2 { +; CHECK-NEXT: DIVERGENT: %indvars.iv2 = phi i64 [ 0, %for.body ], [ %indvars.iv.next2, %for.body2 ] +; CHECK-NEXT: DIVERGENT: %idx = add i64 %row, %indvars.iv2 +; CHECK-NEXT: DIVERGENT: %trunc = trunc i64 %idx to i32 +; CHECK-NEXT: DIVERGENT: %val = sitofp i32 %trunc to float +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds float, float* %ptr, i64 %idx +; CHECK-NEXT: DIVERGENT: store float %val, float* %arrayidx, align 4 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next2 = add nuw nsw i64 %indvars.iv2, 1 +; CHECK-NEXT: } +define void @test1(float* nocapture %ptr, i64 %n) #0 { + entry: + %cmp = icmp sgt i64 %n, 0 + br i1 %cmp, label %for.body.lr.ph, label %exit + +for.body.lr.ph: ; preds = %entry + br label %for.body + +exit: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.latch ] + br label %for.body2 + +for.body2: + %indvars.iv2 = phi i64 [ 0, %for.body ], [ %indvars.iv.next2, %for.body2 ] + %row = mul i64 %n, %indvars.iv + %idx = add i64 %row, %indvars.iv2 + %trunc = trunc i64 %idx to i32 + %val = sitofp i32 %trunc to float + %arrayidx = getelementptr inbounds float, float* %ptr, i64 %idx + store float %val, float* %arrayidx, align 4 + %indvars.iv.next2 = add nuw nsw i64 %indvars.iv2, 1 + %exitcond2 = icmp eq i64 %indvars.iv.next2, %n + br i1 %exitcond2, label %for.latch, label %for.body2 + +for.latch: + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond, label %exit, label %for.body +} + +attributes #0 = { nounwind } Index: test/Analysis/DivergenceAnalysis/Loops/NonAffineUniLoop.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/Loops/NonAffineUniLoop.ll @@ -0,0 +1,110 @@ +; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s + +; CHECK: Divergence of loop for.body { +; CHECK-NEXT: DIVERGENT: %indvars.iv53 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next54, %for.cond.cleanup3 ] +; CHECK-NEXT: DIVERGENT: %indvars.iv.next54 = add nuw nsw i64 %indvars.iv53, 1 +; CHECK-NEXT: DIVERGENT: %5 = add nsw i64 %4, %indvars.iv53 +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds double, double* %A, i64 %5 +; CHECK-NEXT: DIVERGENT: %6 = load double, double* %arrayidx, align 8 +; CHECK-NEXT: DIVERGENT: %8 = add nsw i64 %7, %indvars.iv53 +; CHECK-NEXT: DIVERGENT: %arrayidx14 = getelementptr inbounds double, double* %A, i64 %8 +; CHECK-NEXT: DIVERGENT: %9 = load double, double* %arrayidx14, align 8 +; CHECK-NEXT: DIVERGENT: %add15 = fadd double %6, %9 +; CHECK-NEXT: DIVERGENT: store double %add15, double* %arrayidx14, align 8 +; CHECK-NEXT: } +; CHECK: Divergence of loop for.body8.lr.ph { +; CHECK-NEXT: DIVERGENT: %mul44 = phi i32 [ %mul, %for.cond.cleanup7 ], [ 2, %for.body8.lr.ph.preheader ] +; CHECK-NEXT: DIVERGENT: %len.043 = phi i32 [ %mul44, %for.cond.cleanup7 ], [ 1, %for.body8.lr.ph.preheader ] +; CHECK-NEXT: DIVERGENT: %1 = sext i32 %mul44 to i64 +; CHECK-NEXT: DIVERGENT: %2 = sext i32 %len.043 to i64 +; CHECK-NEXT: DIVERGENT: %mul = shl nsw i32 %mul44, 1 +; CHECK-NEXT: DIVERGENT: %indvars.iv = phi i64 [ 0, %for.body8.lr.ph ], [ %indvars.iv.next, %for.body8 ] +; CHECK-NEXT: DIVERGENT: %3 = add nsw i64 %indvars.iv, %2 +; CHECK-NEXT: DIVERGENT: %4 = mul nsw i64 %3, %0 +; CHECK-NEXT: DIVERGENT: %5 = add nsw i64 %4, %indvars.iv53 +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds double, double* %A, i64 %5 +; CHECK-NEXT: DIVERGENT: %6 = load double, double* %arrayidx, align 8 +; CHECK-NEXT: DIVERGENT: %7 = mul nsw i64 %indvars.iv, %0 +; CHECK-NEXT: DIVERGENT: %8 = add nsw i64 %7, %indvars.iv53 +; CHECK-NEXT: DIVERGENT: %arrayidx14 = getelementptr inbounds double, double* %A, i64 %8 +; CHECK-NEXT: DIVERGENT: %9 = load double, double* %arrayidx14, align 8 +; CHECK-NEXT: DIVERGENT: %add15 = fadd double %6, %9 +; CHECK-NEXT: DIVERGENT: store double %add15, double* %arrayidx14, align 8 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next = add i64 %indvars.iv, %1 +; CHECK-NEXT: DIVERGENT: %cmp6 = icmp slt i64 %indvars.iv.next, %0 +; CHECK-NEXT: DIVERGENT: br i1 %cmp6, label %for.body8, label %for.cond.cleanup7 +; CHECK-NEXT: } +; CHECK: Divergence of loop for.body8 { +; CHECK-NEXT: DIVERGENT: %indvars.iv = phi i64 [ 0, %for.body8.lr.ph ], [ %indvars.iv.next, %for.body8 ] +; CHECK-NEXT: DIVERGENT: %3 = add nsw i64 %indvars.iv, %2 +; CHECK-NEXT: DIVERGENT: %4 = mul nsw i64 %3, %0 +; CHECK-NEXT: DIVERGENT: %5 = add nsw i64 %4, %indvars.iv53 +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds double, double* %A, i64 %5 +; CHECK-NEXT: DIVERGENT: %6 = load double, double* %arrayidx, align 8 +; CHECK-NEXT: DIVERGENT: %7 = mul nsw i64 %indvars.iv, %0 +; CHECK-NEXT: DIVERGENT: %8 = add nsw i64 %7, %indvars.iv53 +; CHECK-NEXT: DIVERGENT: %arrayidx14 = getelementptr inbounds double, double* %A, i64 %8 +; CHECK-NEXT: DIVERGENT: %9 = load double, double* %arrayidx14, align 8 +; CHECK-NEXT: DIVERGENT: %add15 = fadd double %6, %9 +; CHECK-NEXT: DIVERGENT: store double %add15, double* %arrayidx14, align 8 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next = add i64 %indvars.iv, %1 +; CHECK-NEXT: } + +; Function Attrs: norecurse nounwind uwtable +define void @foo(double* nocapture %A, i32 %n) local_unnamed_addr #0 { +entry: + %cmp45 = icmp sgt i32 %n, 0 + br i1 %cmp45, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %cmp242 = icmp sgt i32 %n, 2 + %0 = sext i32 %n to i64 + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.cond.cleanup3, %entry + ret void + +for.body: ; preds = %for.cond.cleanup3, %for.body.lr.ph + %indvars.iv53 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next54, %for.cond.cleanup3 ] + br i1 %cmp242, label %for.body8.lr.ph.preheader, label %for.cond.cleanup3 + +for.body8.lr.ph.preheader: ; preds = %for.body + br label %for.body8.lr.ph + +for.cond.cleanup3: ; preds = %for.cond.cleanup7, %for.body + %indvars.iv.next54 = add nuw nsw i64 %indvars.iv53, 1 + %exitcond = icmp eq i64 %indvars.iv.next54, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +for.body8.lr.ph: ; preds = %for.body8.lr.ph.preheader, %for.cond.cleanup7 + %mul44 = phi i32 [ %mul, %for.cond.cleanup7 ], [ 2, %for.body8.lr.ph.preheader ] + %len.043 = phi i32 [ %mul44, %for.cond.cleanup7 ], [ 1, %for.body8.lr.ph.preheader ] + %1 = sext i32 %mul44 to i64 + %2 = sext i32 %len.043 to i64 + br label %for.body8 + +for.cond.cleanup7: ; preds = %for.body8 + %mul = shl nsw i32 %mul44, 1 + %cmp2 = icmp slt i32 %mul, %n + br i1 %cmp2, label %for.body8.lr.ph, label %for.cond.cleanup3 + +for.body8: ; preds = %for.body8.lr.ph, %for.body8 + %indvars.iv = phi i64 [ 0, %for.body8.lr.ph ], [ %indvars.iv.next, %for.body8 ] + %3 = add nsw i64 %indvars.iv, %2 + %4 = mul nsw i64 %3, %0 + %5 = add nsw i64 %4, %indvars.iv53 + %arrayidx = getelementptr inbounds double, double* %A, i64 %5 + %6 = load double, double* %arrayidx, align 8 + %7 = mul nsw i64 %indvars.iv, %0 + %8 = add nsw i64 %7, %indvars.iv53 + %arrayidx14 = getelementptr inbounds double, double* %A, i64 %8 + %9 = load double, double* %arrayidx14, align 8 + %add15 = fadd double %6, %9 + store double %add15, double* %arrayidx14, align 8 + %indvars.iv.next = add i64 %indvars.iv, %1 + %cmp6 = icmp slt i64 %indvars.iv.next, %0 + br i1 %cmp6, label %for.body8, label %for.cond.cleanup7 +} + +attributes #0 = { norecurse nounwind uwtable } Index: test/Analysis/DivergenceAnalysis/Loops/SingleBlockLoop.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/Loops/SingleBlockLoop.ll @@ -0,0 +1,29 @@ +; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s + +; CHECK: Divergence of loop for.body { +; CHECK-NEXT: DIVERGENT: %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv +; CHECK-NEXT: DIVERGENT: store float 4.200000e+01, float* %arrayidx, align 4 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK-NEXT: } +define void @test1(float* nocapture %A, i64 %n) #0 { + entry: + %cmp = icmp sgt i64 %n, 0 + br i1 %cmp, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body, %for.body.lr.ph + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv + store float 4.200000e+01, float* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +attributes #0 = { nounwind } Index: test/Analysis/DivergenceAnalysis/NVPTX/daorder.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/NVPTX/daorder.ll @@ -0,0 +1,47 @@ +; RUN: opt %s -analyze -divergence -use-gpu-divergence-analysis | FileCheck %s + +target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +define i32 @daorder(i32 %n) { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'daorder' +entry: + %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %cond = icmp slt i32 %tid, 0 + br i1 %cond, label %A, label %B ; divergent +; CHECK: DIVERGENT: br i1 %cond, +A: + %defAtA = add i32 %n, 1 ; uniform +; CHECK-NOT: DIVERGENT: %defAtA = + br label %C +B: + %defAtB = add i32 %n, 2 ; uniform +; CHECK-NOT: DIVERGENT: %defAtB = + br label %C +C: + %defAtC = phi i32 [ %defAtA, %A ], [ %defAtB, %B ] ; divergent +; CHECK: DIVERGENT: %defAtC = + br label %D + +D: + %i = phi i32 [0, %C], [ %i.inc, %E ] ; uniform +; CHECK-NOT: DIVERGENT: %i = phi + br label %E + +E: + %i.inc = add i32 %i, 1 + %loopCnt = icmp slt i32 %i.inc, %n +; CHECK-NOT: DIVERGENT: %loopCnt = + br i1 %loopCnt, label %D, label %exit + +exit: + ret i32 %n +} + +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() +declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() +declare i32 @llvm.nvvm.read.ptx.sreg.tid.z() +declare i32 @llvm.nvvm.read.ptx.sreg.laneid() + +!nvvm.annotations = !{!0} +!0 = !{i32 (i32)* @daorder, !"kernel", i32 1} Index: test/Analysis/DivergenceAnalysis/NVPTX/diverge.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/NVPTX/diverge.ll @@ -0,0 +1,175 @@ +; RUN: opt %s -analyze -divergence -use-gpu-divergence-analysis | FileCheck %s + +target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +; return (n < 0 ? a + threadIdx.x : b + threadIdx.x) +define i32 @no_diverge(i32 %n, i32 %a, i32 %b) { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'no_diverge' +entry: + %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %cond = icmp slt i32 %n, 0 + br i1 %cond, label %then, label %else ; uniform +; CHECK-NOT: DIVERGENT: br i1 %cond, +then: + %a1 = add i32 %a, %tid + br label %merge +else: + %b2 = add i32 %b, %tid + br label %merge +merge: + %c = phi i32 [ %a1, %then ], [ %b2, %else ] + ret i32 %c +} + +; c = a; +; if (threadIdx.x < 5) // divergent: data dependent +; c = b; +; return c; // c is divergent: sync dependent +define i32 @sync(i32 %a, i32 %b) { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'sync' +bb1: + %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.y() + %cond = icmp slt i32 %tid, 5 + br i1 %cond, label %bb2, label %bb3 +; CHECK: DIVERGENT: br i1 %cond, +bb2: + br label %bb3 +bb3: + %c = phi i32 [ %a, %bb1 ], [ %b, %bb2 ] ; sync dependent on tid +; CHECK: DIVERGENT: %c = + ret i32 %c +} + +; c = 0; +; if (threadIdx.x >= 5) { // divergent +; c = (n < 0 ? a : b); // c here is uniform because n is uniform +; } +; // c here is divergent because it is sync dependent on threadIdx.x >= 5 +; return c; +define i32 @mixed(i32 %n, i32 %a, i32 %b) { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'mixed' +bb1: + %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.z() + %cond = icmp slt i32 %tid, 5 + br i1 %cond, label %bb6, label %bb2 +; CHECK: DIVERGENT: br i1 %cond, +bb2: + %cond2 = icmp slt i32 %n, 0 + br i1 %cond2, label %bb4, label %bb3 +bb3: + br label %bb5 +bb4: + br label %bb5 +bb5: + %c = phi i32 [ %a, %bb3 ], [ %b, %bb4 ] +; CHECK-NOT: DIVERGENT: %c = + br label %bb6 +bb6: + %c2 = phi i32 [ 0, %bb1], [ %c, %bb5 ] +; CHECK: DIVERGENT: %c2 = + ret i32 %c2 +} + +; We conservatively treats all parameters of a __device__ function as divergent. +define i32 @device(i32 %n, i32 %a, i32 %b) { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'device' +; CHECK: DIVERGENT: i32 %n +; CHECK: DIVERGENT: i32 %a +; CHECK: DIVERGENT: i32 %b +entry: + %cond = icmp slt i32 %n, 0 + br i1 %cond, label %then, label %else +; CHECK: DIVERGENT: br i1 %cond, +then: + br label %merge +else: + br label %merge +merge: + %c = phi i32 [ %a, %then ], [ %b, %else ] + ret i32 %c +} + +; int i = 0; +; do { +; i++; // i here is uniform +; } while (i < laneid); +; return i == 10 ? 0 : 1; // i here is divergent +; +; The i defined in the loop is used outside. +define i32 @loop() { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'loop' +entry: + %laneid = call i32 @llvm.nvvm.read.ptx.sreg.laneid() + br label %loop +loop: + %i = phi i32 [ 0, %entry ], [ %i1, %loop ] +; CHECK-NOT: DIVERGENT: %i = + %i1 = add i32 %i, 1 + %exit_cond = icmp sge i32 %i1, %laneid + br i1 %exit_cond, label %loop_exit, label %loop +loop_exit: + %cond = icmp eq i32 %i, 10 + br i1 %cond, label %then, label %else +; CHECK: DIVERGENT: br i1 %cond, +then: + ret i32 0 +else: + ret i32 1 +} + +; Same as @loop, but the loop is in the LCSSA form. +define i32 @lcssa() { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'lcssa' +entry: + %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + br label %loop +loop: + %i = phi i32 [ 0, %entry ], [ %i1, %loop ] +; CHECK-NOT: DIVERGENT: %i = + %i1 = add i32 %i, 1 + %exit_cond = icmp sge i32 %i1, %tid + br i1 %exit_cond, label %loop_exit, label %loop +loop_exit: + %i.lcssa = phi i32 [ %i, %loop ] +; CHECK: DIVERGENT: %i.lcssa = + %cond = icmp eq i32 %i.lcssa, 10 + br i1 %cond, label %then, label %else +; CHECK: DIVERGENT: br i1 %cond, +then: + ret i32 0 +else: + ret i32 1 +} + +; Verifies sync-dependence is computed correctly in the absense of loops. +define i32 @sync_no_loop(i32 %arg) { +entry: + %0 = add i32 %arg, 1 + %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %1 = icmp sge i32 %tid, 10 + br i1 %1, label %bb1, label %bb2 + +bb1: + br label %bb3 + +bb2: + br label %bb3 + +bb3: + %2 = add i32 %0, 2 + ; CHECK-NOT: DIVERGENT: %2 + ret i32 %2 +} + +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() +declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() +declare i32 @llvm.nvvm.read.ptx.sreg.tid.z() +declare i32 @llvm.nvvm.read.ptx.sreg.laneid() + +!nvvm.annotations = !{!0, !1, !2, !3, !4} +!0 = !{i32 (i32, i32, i32)* @no_diverge, !"kernel", i32 1} +!1 = !{i32 (i32, i32)* @sync, !"kernel", i32 1} +!2 = !{i32 (i32, i32, i32)* @mixed, !"kernel", i32 1} +!3 = !{i32 ()* @loop, !"kernel", i32 1} +!4 = !{i32 (i32)* @sync_no_loop, !"kernel", i32 1} Index: test/Analysis/DivergenceAnalysis/NVPTX/hidden_diverge.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/NVPTX/hidden_diverge.ll @@ -0,0 +1,30 @@ +; RUN: opt %s -analyze -divergence -use-gpu-divergence-analysis | FileCheck %s + +target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +define i32 @hidden_diverge(i32 %n, i32 %a, i32 %b) { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'hidden_diverge' +entry: + %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %cond.var = icmp slt i32 %tid, 0 + br i1 %cond.var, label %B, label %C ; divergent +; CHECK: DIVERGENT: br i1 %cond.var, +B: + %cond.uni = icmp slt i32 %n, 0 + br i1 %cond.uni, label %C, label %merge ; uniform +; CHECK-NOT: DIVERGENT: br i1 %cond.uni, +C: + %phi.var.hidden = phi i32 [ 1, %entry ], [ 2, %B ] +; CHECK: DIVERGENT: %phi.var.hidden = phi i32 + br label %merge +merge: + %phi.ipd = phi i32 [ %a, %B ], [ %b, %C ] +; CHECK: DIVERGENT: %phi.ipd = phi i32 + ret i32 %phi.ipd +} + +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() + +!nvvm.annotations = !{!0} +!0 = !{i32 (i32, i32, i32)* @hidden_diverge, !"kernel", i32 1} Index: test/Analysis/DivergenceAnalysis/NVPTX/irreducible.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/NVPTX/irreducible.ll @@ -0,0 +1,55 @@ +; RUN: opt %s -analyze -divergence -use-gpu-divergence-analysis | FileCheck %s + +target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +; This test contains an unstructured loop. +; +-------------- entry ----------------+ +; | | +; V V +; i1 = phi(0, i3) i2 = phi(0, i3) +; j1 = i1 + 1 ---> i3 = phi(j1, j2) <--- j2 = i2 + 2 +; ^ | ^ +; | V | +; +-------- switch (tid / i3) ----------+ +; | +; V +; if (i3 == 5) // divergent +; because sync dependent on (tid / i3). +define i32 @unstructured_loop(i1 %entry_cond) { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'unstructured_loop' +entry: + %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + br i1 %entry_cond, label %loop_entry_1, label %loop_entry_2 +loop_entry_1: + %i1 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ] + %j1 = add i32 %i1, 1 + br label %loop_body +loop_entry_2: + %i2 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ] + %j2 = add i32 %i2, 2 + br label %loop_body +loop_body: + %i3 = phi i32 [ %j1, %loop_entry_1 ], [ %j2, %loop_entry_2 ] + br label %loop_latch +loop_latch: + %div = sdiv i32 %tid, %i3 + switch i32 %div, label %branch [ i32 1, label %loop_entry_1 + i32 2, label %loop_entry_2 ] +branch: + %cmp = icmp eq i32 %i3, 5 + br i1 %cmp, label %then, label %else +; CHECK: DIVERGENT: br i1 %cmp, +then: + ret i32 0 +else: + ret i32 1 +} + +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() +declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() +declare i32 @llvm.nvvm.read.ptx.sreg.tid.z() +declare i32 @llvm.nvvm.read.ptx.sreg.laneid() + +!nvvm.annotations = !{!0} +!0 = !{i32 (i1)* @unstructured_loop, !"kernel", i32 1} Index: test/Analysis/DivergenceAnalysis/NVPTX/lit.local.cfg =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/NVPTX/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'NVPTX' in config.root.targets: + config.unsupported = True