Index: llvm/trunk/include/llvm/Analysis/DivergenceAnalysis.h
===================================================================
--- llvm/trunk/include/llvm/Analysis/DivergenceAnalysis.h
+++ llvm/trunk/include/llvm/Analysis/DivergenceAnalysis.h
@@ -1,61 +0,0 @@
-//===- llvm/Analysis/DivergenceAnalysis.h - Divergence Analysis -*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// The divergence analysis is an LLVM pass which can be used to find out
-// if a branch instruction in a GPU program is divergent or not. It can help
-// branch optimizations such as jump threading and loop unswitching to make
-// better decisions.
-//
-//===----------------------------------------------------------------------===//
-#ifndef LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H
-#define LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H
-
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/IR/Function.h"
-#include "llvm/Pass.h"
-
-namespace llvm {
-class Value;
-class DivergenceAnalysis : public FunctionPass {
-public:
-  static char ID;
-
-  DivergenceAnalysis() : FunctionPass(ID) {
-    initializeDivergenceAnalysisPass(*PassRegistry::getPassRegistry());
-  }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override;
-
-  bool runOnFunction(Function &F) override;
-
-  // Print all divergent branches in the function.
-  void print(raw_ostream &OS, const Module *) const override;
-
-  // Returns true if V is divergent at its definition.
-  //
-  // Even if this function returns false, V may still be divergent when used
-  // in a different basic block.
-  bool isDivergent(const Value *V) const { return DivergentValues.count(V); }
-
-  // Returns true if V is uniform/non-divergent.
-  //
-  // Even if this function returns true, V may still be divergent when used
-  // in a different basic block.
-  bool isUniform(const Value *V) const { return !isDivergent(V); }
-
-  // Keep the analysis results uptodate by removing an erased value.
-  void removeValue(const Value *V) { DivergentValues.erase(V); }
-
-private:
-  // Stores all divergent values.
-  DenseSet<const Value *> DivergentValues;
-};
-} // End llvm namespace
-
-#endif //LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H
\ No newline at end of file
Index: llvm/trunk/include/llvm/Analysis/LegacyDivergenceAnalysis.h
===================================================================
--- llvm/trunk/include/llvm/Analysis/LegacyDivergenceAnalysis.h
+++ llvm/trunk/include/llvm/Analysis/LegacyDivergenceAnalysis.h
@@ -0,0 +1,61 @@
+//===- llvm/Analysis/LegacyDivergenceAnalysis.h - KernelDivergence Analysis -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The kernel divergence analysis is an LLVM pass which determines whether a
+// branch instruction in a GPU program (kernel) is divergent. It can help
+// branch optimizations such as jump threading and loop unswitching to make
+// better decisions.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_ANALYSIS_LEGACY_DIVERGENCE_ANALYSIS_H
+#define LLVM_ANALYSIS_LEGACY_DIVERGENCE_ANALYSIS_H
+
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Pass.h"
+
+namespace llvm {
+class Value;
+class LegacyDivergenceAnalysis : public FunctionPass {
+public:
+  static char ID;
+
+  LegacyDivergenceAnalysis() : FunctionPass(ID) {
+    initializeLegacyDivergenceAnalysisPass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+  bool runOnFunction(Function &F) override;
+
+  // Print all divergent branches in the function.
+  void print(raw_ostream &OS, const Module *) const override;
+
+  // Returns true if V is divergent at its definition.
+  //
+  // Even if this function returns false, V may still be divergent when used
+  // in a different basic block.
+  bool isDivergent(const Value *V) const { return DivergentValues.count(V); }
+
+  // Returns true if V is uniform/non-divergent.
+  //
+  // Even if this function returns true, V may still be divergent when used
+  // in a different basic block.
+  bool isUniform(const Value *V) const { return !isDivergent(V); }
+
+  // Keep the analysis results up to date by removing an erased value.
+  void removeValue(const Value *V) { DivergentValues.erase(V); }
+
+private:
+  // Stores all divergent values.
+  DenseSet<const Value *> DivergentValues;
+};
+} // End llvm namespace
+
+#endif //LLVM_ANALYSIS_LEGACY_DIVERGENCE_ANALYSIS_H
Index: llvm/trunk/include/llvm/Analysis/Passes.h
===================================================================
--- llvm/trunk/include/llvm/Analysis/Passes.h
+++ llvm/trunk/include/llvm/Analysis/Passes.h
@@ -61,10 +61,10 @@
   //===--------------------------------------------------------------------===//
   //
-  // createDivergenceAnalysisPass - This pass determines which branches in a GPU
+  // createLegacyDivergenceAnalysisPass - This pass determines which branches in a GPU
   // program are divergent.
   //
-  FunctionPass *createDivergenceAnalysisPass();
+  FunctionPass *createLegacyDivergenceAnalysisPass();

   //===--------------------------------------------------------------------===//
   //
Index: llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
@@ -289,7 +289,7 @@
   /// Returns whether V is a source of divergence.
   ///
   /// This function provides the target-dependent information for
-  /// the target-independent DivergenceAnalysis. DivergenceAnalysis first
+  /// the target-independent LegacyDivergenceAnalysis. LegacyDivergenceAnalysis first
   /// builds the dependency graph, and then runs the reachability algorithm
   /// starting with the sources of divergence.
bool isSourceOfDivergence(const Value *V) const; Index: llvm/trunk/include/llvm/CodeGen/SelectionDAG.h =================================================================== --- llvm/trunk/include/llvm/CodeGen/SelectionDAG.h +++ llvm/trunk/include/llvm/CodeGen/SelectionDAG.h @@ -28,7 +28,7 @@ #include "llvm/ADT/iterator.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/CodeGen/DAGCombine.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/ISDOpcodes.h" @@ -229,7 +229,7 @@ LLVMContext *Context; CodeGenOpt::Level OptLevel; - DivergenceAnalysis * DA = nullptr; + LegacyDivergenceAnalysis * DA = nullptr; FunctionLoweringInfo * FLI = nullptr; /// The function-level optimization remark emitter. Used to emit remarks @@ -382,7 +382,7 @@ /// Prepare this SelectionDAG to process code in the given MachineFunction. void init(MachineFunction &NewMF, OptimizationRemarkEmitter &NewORE, Pass *PassPtr, const TargetLibraryInfo *LibraryInfo, - DivergenceAnalysis * Divergence); + LegacyDivergenceAnalysis * Divergence); void setFunctionLoweringInfo(FunctionLoweringInfo * FuncInfo) { FLI = FuncInfo; Index: llvm/trunk/include/llvm/CodeGen/TargetLowering.h =================================================================== --- llvm/trunk/include/llvm/CodeGen/TargetLowering.h +++ llvm/trunk/include/llvm/CodeGen/TargetLowering.h @@ -29,7 +29,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" -#include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/CodeGen/DAGCombine.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/RuntimeLibcalls.h" @@ -2655,7 +2655,7 @@ virtual bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, - DivergenceAnalysis *DA) const { + LegacyDivergenceAnalysis *DA) const { return false; } Index: llvm/trunk/include/llvm/InitializePasses.h =================================================================== --- llvm/trunk/include/llvm/InitializePasses.h +++ llvm/trunk/include/llvm/InitializePasses.h @@ -119,7 +119,6 @@ void initializeDependenceAnalysisWrapperPassPass(PassRegistry&); void initializeDetectDeadLanesPass(PassRegistry&); void initializeDivRemPairsLegacyPassPass(PassRegistry&); -void initializeDivergenceAnalysisPass(PassRegistry&); void initializeDomOnlyPrinterPass(PassRegistry&); void initializeDomOnlyViewerPass(PassRegistry&); void initializeDomPrinterPass(PassRegistry&); @@ -191,6 +190,7 @@ void initializeLazyMachineBlockFrequencyInfoPassPass(PassRegistry&); void initializeLazyValueInfoPrinterPass(PassRegistry&); void initializeLazyValueInfoWrapperPassPass(PassRegistry&); +void initializeLegacyDivergenceAnalysisPass(PassRegistry&); void initializeLegacyLICMPassPass(PassRegistry&); void initializeLegacyLoopSinkPassPass(PassRegistry&); void initializeLegalizerPass(PassRegistry&); Index: llvm/trunk/include/llvm/LinkAllPasses.h =================================================================== --- llvm/trunk/include/llvm/LinkAllPasses.h +++ llvm/trunk/include/llvm/LinkAllPasses.h @@ -94,7 +94,6 @@ (void) llvm::createDeadInstEliminationPass(); (void) llvm::createDeadStoreEliminationPass(); (void) llvm::createDependenceAnalysisWrapperPass(); - (void) llvm::createDivergenceAnalysisPass(); (void) llvm::createDomOnlyPrinterPass(); (void) llvm::createDomPrinterPass(); (void) 
llvm::createDomOnlyViewerPass(); @@ -121,6 +120,7 @@ (void) llvm::createInstructionCombiningPass(); (void) llvm::createInternalizePass(); (void) llvm::createLCSSAPass(); + (void) llvm::createLegacyDivergenceAnalysisPass(); (void) llvm::createLICMPass(); (void) llvm::createLoopSinkPass(); (void) llvm::createLazyValueInfoPass(); Index: llvm/trunk/lib/Analysis/Analysis.cpp =================================================================== --- llvm/trunk/lib/Analysis/Analysis.cpp +++ llvm/trunk/lib/Analysis/Analysis.cpp @@ -39,7 +39,6 @@ initializeDependenceAnalysisWrapperPassPass(Registry); initializeDelinearizationPass(Registry); initializeDemandedBitsWrapperPassPass(Registry); - initializeDivergenceAnalysisPass(Registry); initializeDominanceFrontierWrapperPassPass(Registry); initializeDomViewerPass(Registry); initializeDomPrinterPass(Registry); @@ -58,6 +57,7 @@ initializeLazyBlockFrequencyInfoPassPass(Registry); initializeLazyValueInfoWrapperPassPass(Registry); initializeLazyValueInfoPrinterPass(Registry); + initializeLegacyDivergenceAnalysisPass(Registry); initializeLintPass(Registry); initializeLoopInfoWrapperPassPass(Registry); initializeMemDepPrinterPass(Registry); Index: llvm/trunk/lib/Analysis/CMakeLists.txt =================================================================== --- llvm/trunk/lib/Analysis/CMakeLists.txt +++ llvm/trunk/lib/Analysis/CMakeLists.txt @@ -25,7 +25,6 @@ Delinearization.cpp DemandedBits.cpp DependenceAnalysis.cpp - DivergenceAnalysis.cpp DomPrinter.cpp DominanceFrontier.cpp EHPersonalities.cpp @@ -44,6 +43,7 @@ LazyBlockFrequencyInfo.cpp LazyCallGraph.cpp LazyValueInfo.cpp + LegacyDivergenceAnalysis.cpp Lint.cpp Loads.cpp LoopAccessAnalysis.cpp Index: llvm/trunk/lib/Analysis/DivergenceAnalysis.cpp =================================================================== --- llvm/trunk/lib/Analysis/DivergenceAnalysis.cpp +++ llvm/trunk/lib/Analysis/DivergenceAnalysis.cpp @@ -1,340 +0,0 @@ -//===- DivergenceAnalysis.cpp --------- Divergence Analysis Implementation -==// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements divergence analysis which determines whether a branch -// in a GPU program is divergent.It can help branch optimizations such as jump -// threading and loop unswitching to make better decisions. -// -// GPU programs typically use the SIMD execution model, where multiple threads -// in the same execution group have to execute in lock-step. Therefore, if the -// code contains divergent branches (i.e., threads in a group do not agree on -// which path of the branch to take), the group of threads has to execute all -// the paths from that branch with different subsets of threads enabled until -// they converge at the immediately post-dominating BB of the paths. -// -// Due to this execution model, some optimizations such as jump -// threading and loop unswitching can be unfortunately harmful when performed on -// divergent branches. Therefore, an analysis that computes which branches in a -// GPU program are divergent can help the compiler to selectively run these -// optimizations. -// -// This file defines divergence analysis which computes a conservative but -// non-trivial approximation of all divergent branches in a GPU program. 
It
-// partially implements the approach described in
-//
-//   Divergence Analysis
-//   Sampaio, Souza, Collange, Pereira
-//   TOPLAS '13
-//
-// The divergence analysis identifies the sources of divergence (e.g., special
-// variables that hold the thread ID), and recursively marks variables that are
-// data or sync dependent on a source of divergence as divergent.
-//
-// While data dependency is a well-known concept, the notion of sync dependency
-// is worth more explanation. Sync dependence characterizes the control flow
-// aspect of the propagation of branch divergence. For example,
-//
-//   %cond = icmp slt i32 %tid, 10
-//   br i1 %cond, label %then, label %else
-// then:
-//   br label %merge
-// else:
-//   br label %merge
-// merge:
-//   %a = phi i32 [ 0, %then ], [ 1, %else ]
-//
-// Suppose %tid holds the thread ID. Although %a is not data dependent on %tid
-// because %tid is not on its use-def chains, %a is sync dependent on %tid
-// because the branch "br i1 %cond" depends on %tid and affects which value %a
-// is assigned to.
-//
-// The current implementation has the following limitations:
-// 1. intra-procedural. It conservatively considers the arguments of a
-//    non-kernel-entry function and the return value of a function call as
-//    divergent.
-// 2. memory as black box. It conservatively considers values loaded from
-//    generic or local address as divergent. This can be improved by leveraging
-//    pointer analysis.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Analysis/DivergenceAnalysis.h"
-#include "llvm/Analysis/Passes.h"
-#include "llvm/Analysis/PostDominators.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include <vector>
-using namespace llvm;
-
-#define DEBUG_TYPE "divergence"
-
-namespace {
-
-class DivergencePropagator {
-public:
-  DivergencePropagator(Function &F, TargetTransformInfo &TTI, DominatorTree &DT,
-                       PostDominatorTree &PDT, DenseSet<const Value *> &DV)
-      : F(F), TTI(TTI), DT(DT), PDT(PDT), DV(DV) {}
-  void populateWithSourcesOfDivergence();
-  void propagate();
-
-private:
-  // A helper function that explores data dependents of V.
-  void exploreDataDependency(Value *V);
-  // A helper function that explores sync dependents of TI.
-  void exploreSyncDependency(TerminatorInst *TI);
-  // Computes the influence region from Start to End. This region includes all
-  // basic blocks on any simple path from Start to End.
-  void computeInfluenceRegion(BasicBlock *Start, BasicBlock *End,
-                              DenseSet<BasicBlock *> &InfluenceRegion);
-  // Finds all users of I that are outside the influence region, and add these
-  // users to Worklist.
-  void findUsersOutsideInfluenceRegion(
-      Instruction &I, const DenseSet<BasicBlock *> &InfluenceRegion);
-
-  Function &F;
-  TargetTransformInfo &TTI;
-  DominatorTree &DT;
-  PostDominatorTree &PDT;
-  std::vector<Value *> Worklist; // Stack for DFS.
-  DenseSet<const Value *> &DV;   // Stores all divergent values.
-};
-
-void DivergencePropagator::populateWithSourcesOfDivergence() {
-  Worklist.clear();
-  DV.clear();
-  for (auto &I : instructions(F)) {
-    if (TTI.isSourceOfDivergence(&I)) {
-      Worklist.push_back(&I);
-      DV.insert(&I);
-    }
-  }
-  for (auto &Arg : F.args()) {
-    if (TTI.isSourceOfDivergence(&Arg)) {
-      Worklist.push_back(&Arg);
-      DV.insert(&Arg);
-    }
-  }
-}
-
-void DivergencePropagator::exploreSyncDependency(TerminatorInst *TI) {
-  // Propagation rule 1: if branch TI is divergent, all PHINodes in TI's
-  // immediate post dominator are divergent. This rule handles if-then-else
-  // patterns. For example,
-  //
-  //   if (tid < 5)
-  //     a1 = 1;
-  //   else
-  //     a2 = 2;
-  //   a = phi(a1, a2); // sync dependent on (tid < 5)
-  BasicBlock *ThisBB = TI->getParent();
-
-  // Unreachable blocks may not be in the dominator tree.
-  if (!DT.isReachableFromEntry(ThisBB))
-    return;
-
-  // If the function has no exit blocks or doesn't reach any exit blocks, the
-  // post dominator may be null.
-  DomTreeNode *ThisNode = PDT.getNode(ThisBB);
-  if (!ThisNode)
-    return;
-
-  BasicBlock *IPostDom = ThisNode->getIDom()->getBlock();
-  if (IPostDom == nullptr)
-    return;
-
-  for (auto I = IPostDom->begin(); isa<PHINode>(I); ++I) {
-    // A PHINode is uniform if it returns the same value no matter which path is
-    // taken.
-    if (!cast<PHINode>(I)->hasConstantOrUndefValue() && DV.insert(&*I).second)
-      Worklist.push_back(&*I);
-  }
-
-  // Propagation rule 2: if a value defined in a loop is used outside, the user
-  // is sync dependent on the condition of the loop exits that dominate the
-  // user. For example,
-  //
-  //   int i = 0;
-  //   do {
-  //     i++;
-  //     if (foo(i)) ... // uniform
-  //   } while (i < tid);
-  //   if (bar(i)) ...   // divergent
-  //
-  // A program may contain unstructured loops. Therefore, we cannot leverage
-  // LoopInfo, which only recognizes natural loops.
-  //
-  // The algorithm used here handles both natural and unstructured loops. Given
-  // a branch TI, we first compute its influence region, the union of all simple
-  // paths from TI to its immediate post dominator (IPostDom). Then, we search
-  // for all the values defined in the influence region but used outside. All
-  // these users are sync dependent on TI.
-  DenseSet<BasicBlock *> InfluenceRegion;
-  computeInfluenceRegion(ThisBB, IPostDom, InfluenceRegion);
-  // An insight that can speed up the search process is that all the in-region
-  // values that are used outside must dominate TI. Therefore, instead of
-  // searching every basic blocks in the influence region, we search all the
-  // dominators of TI until it is outside the influence region.
-  BasicBlock *InfluencedBB = ThisBB;
-  while (InfluenceRegion.count(InfluencedBB)) {
-    for (auto &I : *InfluencedBB)
-      findUsersOutsideInfluenceRegion(I, InfluenceRegion);
-    DomTreeNode *IDomNode = DT.getNode(InfluencedBB)->getIDom();
-    if (IDomNode == nullptr)
-      break;
-    InfluencedBB = IDomNode->getBlock();
-  }
-}
-
-void DivergencePropagator::findUsersOutsideInfluenceRegion(
-    Instruction &I, const DenseSet<BasicBlock *> &InfluenceRegion) {
-  for (User *U : I.users()) {
-    Instruction *UserInst = cast<Instruction>(U);
-    if (!InfluenceRegion.count(UserInst->getParent())) {
-      if (DV.insert(UserInst).second)
-        Worklist.push_back(UserInst);
-    }
-  }
-}
-
-// A helper function for computeInfluenceRegion that adds successors of "ThisBB"
-// to the influence region.
-static void
-addSuccessorsToInfluenceRegion(BasicBlock *ThisBB, BasicBlock *End,
-                               DenseSet<BasicBlock *> &InfluenceRegion,
-                               std::vector<BasicBlock *> &InfluenceStack) {
-  for (BasicBlock *Succ : successors(ThisBB)) {
-    if (Succ != End && InfluenceRegion.insert(Succ).second)
-      InfluenceStack.push_back(Succ);
-  }
-}
-
-void DivergencePropagator::computeInfluenceRegion(
-    BasicBlock *Start, BasicBlock *End,
-    DenseSet<BasicBlock *> &InfluenceRegion) {
-  assert(PDT.properlyDominates(End, Start) &&
-         "End does not properly dominate Start");
-
-  // The influence region starts from the end of "Start" to the beginning of
-  // "End". Therefore, "Start" should not be in the region unless "Start" is in
-  // a loop that doesn't contain "End".
-  std::vector<BasicBlock *> InfluenceStack;
-  addSuccessorsToInfluenceRegion(Start, End, InfluenceRegion, InfluenceStack);
-  while (!InfluenceStack.empty()) {
-    BasicBlock *BB = InfluenceStack.back();
-    InfluenceStack.pop_back();
-    addSuccessorsToInfluenceRegion(BB, End, InfluenceRegion, InfluenceStack);
-  }
-}
-
-void DivergencePropagator::exploreDataDependency(Value *V) {
-  // Follow def-use chains of V.
-  for (User *U : V->users()) {
-    Instruction *UserInst = cast<Instruction>(U);
-    if (!TTI.isAlwaysUniform(U) && DV.insert(UserInst).second)
-      Worklist.push_back(UserInst);
-  }
-}
-
-void DivergencePropagator::propagate() {
-  // Traverse the dependency graph using DFS.
-  while (!Worklist.empty()) {
-    Value *V = Worklist.back();
-    Worklist.pop_back();
-    if (TerminatorInst *TI = dyn_cast<TerminatorInst>(V)) {
-      // Terminators with less than two successors won't introduce sync
-      // dependency. Ignore them.
-      if (TI->getNumSuccessors() > 1)
-        exploreSyncDependency(TI);
-    }
-    exploreDataDependency(V);
-  }
-}
-
-} /// end namespace anonymous
-
-// Register this pass.
-char DivergenceAnalysis::ID = 0;
-INITIALIZE_PASS_BEGIN(DivergenceAnalysis, "divergence", "Divergence Analysis",
-                      false, true)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
-INITIALIZE_PASS_END(DivergenceAnalysis, "divergence", "Divergence Analysis",
-                    false, true)
-
-FunctionPass *llvm::createDivergenceAnalysisPass() {
-  return new DivergenceAnalysis();
-}
-
-void DivergenceAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
-  AU.addRequired<DominatorTreeWrapperPass>();
-  AU.addRequired<PostDominatorTreeWrapperPass>();
-  AU.setPreservesAll();
-}
-
-bool DivergenceAnalysis::runOnFunction(Function &F) {
-  auto *TTIWP = getAnalysisIfAvailable<TargetTransformInfoWrapperPass>();
-  if (TTIWP == nullptr)
-    return false;
-
-  TargetTransformInfo &TTI = TTIWP->getTTI(F);
-  // Fast path: if the target does not have branch divergence, we do not mark
-  // any branch as divergent.
-  if (!TTI.hasBranchDivergence())
-    return false;
-
-  DivergentValues.clear();
-  auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
-  DivergencePropagator DP(F, TTI,
-                          getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
-                          PDT, DivergentValues);
-  DP.populateWithSourcesOfDivergence();
-  DP.propagate();
-  LLVM_DEBUG(
-    dbgs() << "\nAfter divergence analysis on " << F.getName() << ":\n";
-    print(dbgs(), F.getParent())
-  );
-  return false;
-}
-
-void DivergenceAnalysis::print(raw_ostream &OS, const Module *) const {
-  if (DivergentValues.empty())
-    return;
-  const Value *FirstDivergentValue = *DivergentValues.begin();
-  const Function *F;
-  if (const Argument *Arg = dyn_cast<Argument>(FirstDivergentValue)) {
-    F = Arg->getParent();
-  } else if (const Instruction *I =
-                 dyn_cast<Instruction>(FirstDivergentValue)) {
-    F = I->getParent()->getParent();
-  } else {
-    llvm_unreachable("Only arguments and instructions can be divergent");
-  }
-
-  // Dumps all divergent values in F, arguments and then instructions.
-  for (auto &Arg : F->args()) {
-    OS << (DivergentValues.count(&Arg) ? "DIVERGENT: " : "           ");
-    OS << Arg << "\n";
-  }
-  // Iterate instructions using instructions() to ensure a deterministic order.
-  for (auto BI = F->begin(), BE = F->end(); BI != BE; ++BI) {
-    auto &BB = *BI;
-    OS << "\n           " << BB.getName() << ":\n";
-    for (auto &I : BB.instructionsWithoutDebug()) {
-      OS << (DivergentValues.count(&I) ? "DIVERGENT: " : "           ");
-      OS << I << "\n";
-    }
-  }
-  OS << "\n";
-}
Index: llvm/trunk/lib/Analysis/LegacyDivergenceAnalysis.cpp
===================================================================
--- llvm/trunk/lib/Analysis/LegacyDivergenceAnalysis.cpp
+++ llvm/trunk/lib/Analysis/LegacyDivergenceAnalysis.cpp
@@ -0,0 +1,340 @@
+//===- LegacyDivergenceAnalysis.cpp --------- Legacy Divergence Analysis Implementation -==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements divergence analysis which determines whether a branch
+// in a GPU program is divergent. It can help branch optimizations such as jump
+// threading and loop unswitching to make better decisions.
+//
+// GPU programs typically use the SIMD execution model, where multiple threads
+// in the same execution group have to execute in lock-step. Therefore, if the
+// code contains divergent branches (i.e., threads in a group do not agree on
+// which path of the branch to take), the group of threads has to execute all
+// the paths from that branch with different subsets of threads enabled until
+// they converge at the immediately post-dominating BB of the paths.
+//
+// Due to this execution model, some optimizations such as jump
+// threading and loop unswitching can unfortunately be harmful when performed on
+// divergent branches. Therefore, an analysis that computes which branches in a
+// GPU program are divergent can help the compiler to selectively run these
+// optimizations.
+//
+// This file defines divergence analysis which computes a conservative but
+// non-trivial approximation of all divergent branches in a GPU program.
It
+// partially implements the approach described in
+//
+//   Divergence Analysis
+//   Sampaio, Souza, Collange, Pereira
+//   TOPLAS '13
+//
+// The divergence analysis identifies the sources of divergence (e.g., special
+// variables that hold the thread ID), and recursively marks variables that are
+// data or sync dependent on a source of divergence as divergent.
+//
+// While data dependency is a well-known concept, the notion of sync dependency
+// is worth more explanation. Sync dependence characterizes the control flow
+// aspect of the propagation of branch divergence. For example,
+//
+//   %cond = icmp slt i32 %tid, 10
+//   br i1 %cond, label %then, label %else
+// then:
+//   br label %merge
+// else:
+//   br label %merge
+// merge:
+//   %a = phi i32 [ 0, %then ], [ 1, %else ]
+//
+// Suppose %tid holds the thread ID. Although %a is not data dependent on %tid
+// because %tid is not on its use-def chains, %a is sync dependent on %tid
+// because the branch "br i1 %cond" depends on %tid and affects which value %a
+// is assigned to.
+//
+// The current implementation has the following limitations:
+// 1. intra-procedural. It conservatively considers the arguments of a
+//    non-kernel-entry function and the return value of a function call as
+//    divergent.
+// 2. memory as black box. It conservatively considers values loaded from
+//    generic or local address as divergent. This can be improved by leveraging
+//    pointer analysis.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <vector>
+using namespace llvm;
+
+#define DEBUG_TYPE "divergence"
+
+namespace {
+
+class DivergencePropagator {
+public:
+  DivergencePropagator(Function &F, TargetTransformInfo &TTI, DominatorTree &DT,
+                       PostDominatorTree &PDT, DenseSet<const Value *> &DV)
+      : F(F), TTI(TTI), DT(DT), PDT(PDT), DV(DV) {}
+  void populateWithSourcesOfDivergence();
+  void propagate();
+
+private:
+  // A helper function that explores data dependents of V.
+  void exploreDataDependency(Value *V);
+  // A helper function that explores sync dependents of TI.
+  void exploreSyncDependency(TerminatorInst *TI);
+  // Computes the influence region from Start to End. This region includes all
+  // basic blocks on any simple path from Start to End.
+  void computeInfluenceRegion(BasicBlock *Start, BasicBlock *End,
+                              DenseSet<BasicBlock *> &InfluenceRegion);
+  // Finds all users of I that are outside the influence region, and add these
+  // users to Worklist.
+  void findUsersOutsideInfluenceRegion(
+      Instruction &I, const DenseSet<BasicBlock *> &InfluenceRegion);
+
+  Function &F;
+  TargetTransformInfo &TTI;
+  DominatorTree &DT;
+  PostDominatorTree &PDT;
+  std::vector<Value *> Worklist; // Stack for DFS.
+  DenseSet<const Value *> &DV;   // Stores all divergent values.
+};
+
+void DivergencePropagator::populateWithSourcesOfDivergence() {
+  Worklist.clear();
+  DV.clear();
+  for (auto &I : instructions(F)) {
+    if (TTI.isSourceOfDivergence(&I)) {
+      Worklist.push_back(&I);
+      DV.insert(&I);
+    }
+  }
+  for (auto &Arg : F.args()) {
+    if (TTI.isSourceOfDivergence(&Arg)) {
+      Worklist.push_back(&Arg);
+      DV.insert(&Arg);
+    }
+  }
+}
+
+void DivergencePropagator::exploreSyncDependency(TerminatorInst *TI) {
+  // Propagation rule 1: if branch TI is divergent, all PHINodes in TI's
+  // immediate post dominator are divergent. This rule handles if-then-else
+  // patterns. For example,
+  //
+  //   if (tid < 5)
+  //     a1 = 1;
+  //   else
+  //     a2 = 2;
+  //   a = phi(a1, a2); // sync dependent on (tid < 5)
+  BasicBlock *ThisBB = TI->getParent();
+
+  // Unreachable blocks may not be in the dominator tree.
+  if (!DT.isReachableFromEntry(ThisBB))
+    return;
+
+  // If the function has no exit blocks or doesn't reach any exit blocks, the
+  // post dominator may be null.
+  DomTreeNode *ThisNode = PDT.getNode(ThisBB);
+  if (!ThisNode)
+    return;
+
+  BasicBlock *IPostDom = ThisNode->getIDom()->getBlock();
+  if (IPostDom == nullptr)
+    return;
+
+  for (auto I = IPostDom->begin(); isa<PHINode>(I); ++I) {
+    // A PHINode is uniform if it returns the same value no matter which path is
+    // taken.
+    if (!cast<PHINode>(I)->hasConstantOrUndefValue() && DV.insert(&*I).second)
+      Worklist.push_back(&*I);
+  }
+
+  // Propagation rule 2: if a value defined in a loop is used outside, the user
+  // is sync dependent on the condition of the loop exits that dominate the
+  // user. For example,
+  //
+  //   int i = 0;
+  //   do {
+  //     i++;
+  //     if (foo(i)) ... // uniform
+  //   } while (i < tid);
+  //   if (bar(i)) ...   // divergent
+  //
+  // A program may contain unstructured loops. Therefore, we cannot leverage
+  // LoopInfo, which only recognizes natural loops.
+  //
+  // The algorithm used here handles both natural and unstructured loops. Given
+  // a branch TI, we first compute its influence region, the union of all simple
+  // paths from TI to its immediate post dominator (IPostDom). Then, we search
+  // for all the values defined in the influence region but used outside. All
+  // these users are sync dependent on TI.
+  DenseSet<BasicBlock *> InfluenceRegion;
+  computeInfluenceRegion(ThisBB, IPostDom, InfluenceRegion);
+  // An insight that can speed up the search process is that all the in-region
+  // values that are used outside must dominate TI. Therefore, instead of
+  // searching every basic block in the influence region, we search all the
+  // dominators of TI until it is outside the influence region.
+  BasicBlock *InfluencedBB = ThisBB;
+  while (InfluenceRegion.count(InfluencedBB)) {
+    for (auto &I : *InfluencedBB)
+      findUsersOutsideInfluenceRegion(I, InfluenceRegion);
+    DomTreeNode *IDomNode = DT.getNode(InfluencedBB)->getIDom();
+    if (IDomNode == nullptr)
+      break;
+    InfluencedBB = IDomNode->getBlock();
+  }
+}
+
+void DivergencePropagator::findUsersOutsideInfluenceRegion(
+    Instruction &I, const DenseSet<BasicBlock *> &InfluenceRegion) {
+  for (User *U : I.users()) {
+    Instruction *UserInst = cast<Instruction>(U);
+    if (!InfluenceRegion.count(UserInst->getParent())) {
+      if (DV.insert(UserInst).second)
+        Worklist.push_back(UserInst);
+    }
+  }
+}
+
+// A helper function for computeInfluenceRegion that adds successors of "ThisBB"
+// to the influence region.
+static void
+addSuccessorsToInfluenceRegion(BasicBlock *ThisBB, BasicBlock *End,
+                               DenseSet<BasicBlock *> &InfluenceRegion,
+                               std::vector<BasicBlock *> &InfluenceStack) {
+  for (BasicBlock *Succ : successors(ThisBB)) {
+    if (Succ != End && InfluenceRegion.insert(Succ).second)
+      InfluenceStack.push_back(Succ);
+  }
+}
+
+void DivergencePropagator::computeInfluenceRegion(
+    BasicBlock *Start, BasicBlock *End,
+    DenseSet<BasicBlock *> &InfluenceRegion) {
+  assert(PDT.properlyDominates(End, Start) &&
+         "End does not properly dominate Start");
+
+  // The influence region starts from the end of "Start" to the beginning of
+  // "End". Therefore, "Start" should not be in the region unless "Start" is in
+  // a loop that doesn't contain "End".
+  std::vector<BasicBlock *> InfluenceStack;
+  addSuccessorsToInfluenceRegion(Start, End, InfluenceRegion, InfluenceStack);
+  while (!InfluenceStack.empty()) {
+    BasicBlock *BB = InfluenceStack.back();
+    InfluenceStack.pop_back();
+    addSuccessorsToInfluenceRegion(BB, End, InfluenceRegion, InfluenceStack);
+  }
+}
+
+void DivergencePropagator::exploreDataDependency(Value *V) {
+  // Follow def-use chains of V.
+  for (User *U : V->users()) {
+    Instruction *UserInst = cast<Instruction>(U);
+    if (!TTI.isAlwaysUniform(U) && DV.insert(UserInst).second)
+      Worklist.push_back(UserInst);
+  }
+}
+
+void DivergencePropagator::propagate() {
+  // Traverse the dependency graph using DFS.
+  while (!Worklist.empty()) {
+    Value *V = Worklist.back();
+    Worklist.pop_back();
+    if (TerminatorInst *TI = dyn_cast<TerminatorInst>(V)) {
+      // Terminators with fewer than two successors won't introduce sync
+      // dependency. Ignore them.
+      if (TI->getNumSuccessors() > 1)
+        exploreSyncDependency(TI);
+    }
+    exploreDataDependency(V);
+  }
+}
+
+} /// end namespace anonymous
+
+// Register this pass.
+char LegacyDivergenceAnalysis::ID = 0;
+INITIALIZE_PASS_BEGIN(LegacyDivergenceAnalysis, "divergence",
+                      "Legacy Divergence Analysis", false, true)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_END(LegacyDivergenceAnalysis, "divergence",
+                    "Legacy Divergence Analysis", false, true)
+
+FunctionPass *llvm::createLegacyDivergenceAnalysisPass() {
+  return new LegacyDivergenceAnalysis();
+}
+
+void LegacyDivergenceAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<DominatorTreeWrapperPass>();
+  AU.addRequired<PostDominatorTreeWrapperPass>();
+  AU.setPreservesAll();
+}
+
+bool LegacyDivergenceAnalysis::runOnFunction(Function &F) {
+  auto *TTIWP = getAnalysisIfAvailable<TargetTransformInfoWrapperPass>();
+  if (TTIWP == nullptr)
+    return false;
+
+  TargetTransformInfo &TTI = TTIWP->getTTI(F);
+  // Fast path: if the target does not have branch divergence, we do not mark
+  // any branch as divergent.
+  if (!TTI.hasBranchDivergence())
+    return false;
+
+  DivergentValues.clear();
+  auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+  DivergencePropagator DP(F, TTI,
+                          getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+                          PDT, DivergentValues);
+  DP.populateWithSourcesOfDivergence();
+  DP.propagate();
+  LLVM_DEBUG(
+    dbgs() << "\nAfter divergence analysis on " << F.getName() << ":\n";
+    print(dbgs(), F.getParent())
+  );
+  return false;
+}
+
+void LegacyDivergenceAnalysis::print(raw_ostream &OS, const Module *) const {
+  if (DivergentValues.empty())
+    return;
+  const Value *FirstDivergentValue = *DivergentValues.begin();
+  const Function *F;
+  if (const Argument *Arg = dyn_cast<Argument>(FirstDivergentValue)) {
+    F = Arg->getParent();
+  } else if (const Instruction *I =
+                 dyn_cast<Instruction>(FirstDivergentValue)) {
+    F = I->getParent()->getParent();
+  } else {
+    llvm_unreachable("Only arguments and instructions can be divergent");
+  }
+
+  // Dumps all divergent values in F, arguments and then instructions.
+  for (auto &Arg : F->args()) {
+    OS << (DivergentValues.count(&Arg) ? "DIVERGENT: " : "           ");
+    OS << Arg << "\n";
+  }
+  // Iterate instructions using instructions() to ensure a deterministic order.
+  for (auto BI = F->begin(), BE = F->end(); BI != BE; ++BI) {
+    auto &BB = *BI;
+    OS << "\n           " << BB.getName() << ":\n";
+    for (auto &I : BB.instructionsWithoutDebug()) {
+      OS << (DivergentValues.count(&I) ? "DIVERGENT: " : "           ");
+      OS << I << "\n";
    }
  }
  OS << "\n";
}
Index: llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -984,7 +984,7 @@
 void SelectionDAG::init(MachineFunction &NewMF,
                         OptimizationRemarkEmitter &NewORE,
                         Pass *PassPtr, const TargetLibraryInfo *LibraryInfo,
-                        DivergenceAnalysis * Divergence) {
+                        LegacyDivergenceAnalysis * Divergence) {
   MF = &NewMF;
   SDAGISelPass = PassPtr;
   ORE = &NewORE;
Index: llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -417,7 +417,7 @@
   SplitCriticalSideEffectEdges(const_cast<Function &>(Fn), DT, LI);

   CurDAG->init(*MF, *ORE, this, LibInfo,
-               getAnalysisIfAvailable<DivergenceAnalysis>());
+               getAnalysisIfAvailable<LegacyDivergenceAnalysis>());
   FuncInfo->set(Fn, *MF, CurDAG);

   // Now get the optional analyzes if we want to.
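[Editor's note — not part of the patch] For downstream code, this rename is mechanical: only the header path, the required-pass name, and the getAnalysis<> template argument change; the query API (isDivergent/isUniform/removeValue) is untouched. A minimal sketch of a migrated consumer, using a hypothetical out-of-tree legacy pass named MyGPUOpt:

#include "llvm/Analysis/LegacyDivergenceAnalysis.h" // was: DivergenceAnalysis.h
#include "llvm/IR/Function.h"
#include "llvm/Pass.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

namespace {
// Hypothetical consumer, shown only to illustrate the rename.
struct MyGPUOpt : public FunctionPass {
  static char ID;
  MyGPUOpt() : FunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LegacyDivergenceAnalysis>(); // was: DivergenceAnalysis
    AU.setPreservesAll();
  }

  bool runOnFunction(Function &F) override {
    // was: getAnalysis<DivergenceAnalysis>()
    auto &DA = getAnalysis<LegacyDivergenceAnalysis>();
    for (Argument &Arg : F.args())
      if (DA.isDivergent(&Arg)) // the query API is unchanged
        errs() << "divergent argument: " << Arg << "\n";
    return false; // analysis consumer only; IR is not modified
  }
};
} // end anonymous namespace

char MyGPUOpt::ID = 0;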
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -16,7 +16,7 @@
 #include "AMDGPU.h"
 #include "AMDGPUIntrinsicInfo.h"
 #include "llvm/ADT/SetVector.h"
-#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/MemoryDependenceAnalysis.h"
 #include "llvm/IR/IRBuilder.h"
@@ -32,7 +32,7 @@
 class AMDGPUAnnotateUniformValues : public FunctionPass,
        public InstVisitor<AMDGPUAnnotateUniformValues> {
-  DivergenceAnalysis *DA;
+  LegacyDivergenceAnalysis *DA;
   MemoryDependenceResults *MDR;
   LoopInfo *LI;
   DenseMap<Value *, GetElementPtrInst *> noClobberClones;
@@ -49,7 +49,7 @@
     return "AMDGPU Annotate Uniform Values";
   }
   void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequired<DivergenceAnalysis>();
+    AU.addRequired<LegacyDivergenceAnalysis>();
     AU.addRequired<MemoryDependenceWrapperPass>();
     AU.addRequired<LoopInfoWrapperPass>();
     AU.setPreservesAll();
@@ -64,7 +64,7 @@
 INITIALIZE_PASS_BEGIN(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
                       "Add AMDGPU uniform metadata", false, false)
-INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
 INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
@@ -176,7 +176,7 @@
   if (skipFunction(F))
     return false;

-  DA = &getAnalysis<DivergenceAnalysis>();
+  DA = &getAnalysis<LegacyDivergenceAnalysis>();
   MDR = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
   LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   isKernelFunc = F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -18,7 +18,7 @@
 #include "AMDGPUTargetMachine.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/Passes.h"
@@ -60,7 +60,7 @@
     public InstVisitor<AMDGPUCodeGenPrepare, bool> {
   const GCNSubtarget *ST = nullptr;
   AssumptionCache *AC = nullptr;
-  DivergenceAnalysis *DA = nullptr;
+  LegacyDivergenceAnalysis *DA = nullptr;
   Module *Mod = nullptr;
   bool HasUnsafeFPMath = false;
   AMDGPUAS AMDGPUASI;
@@ -177,7 +177,7 @@
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<AssumptionCacheTracker>();
-    AU.addRequired<DivergenceAnalysis>();
+    AU.addRequired<LegacyDivergenceAnalysis>();
     AU.setPreservesAll();
   }
 };
@@ -898,7 +898,7 @@
   const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
   ST = &TM.getSubtarget<GCNSubtarget>(F);
   AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
-  DA = &getAnalysis<DivergenceAnalysis>();
+  DA = &getAnalysis<LegacyDivergenceAnalysis>();
   HasUnsafeFPMath = hasUnsafeFPMath(F);
   AMDGPUASI = TM.getAMDGPUAS();
@@ -918,7 +918,7 @@
 INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                       "AMDGPU IR optimizations", false, false)
 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
 INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
                     false, false)
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -29,7 +29,7 @@
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
@@ -87,7 +87,7 @@
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<AMDGPUArgumentUsageInfo>();
     AU.addRequired<AMDGPUPerfHintAnalysis>();
-    AU.addRequired<DivergenceAnalysis>();
+    AU.addRequired<LegacyDivergenceAnalysis>();
     SelectionDAGISel::getAnalysisUsage(AU);
   }
@@ -253,7 +253,7 @@
                       "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
 INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo)
 INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysis)
-INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
 INITIALIZE_PASS_END(AMDGPUDAGToDAGISel, "isel",
                     "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -16,7 +16,6 @@
 #include "AMDGPUSubtarget.h"
 #include "AMDGPUTargetMachine.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/DivergenceAnalysis.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -25,7 +25,7 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
 #include "llvm/Analysis/PostDominators.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Transforms/Utils/Local.h"
@@ -70,7 +70,7 @@
 INITIALIZE_PASS_BEGIN(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
                       "Unify divergent function exit nodes", false, false)
 INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
 INITIALIZE_PASS_END(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
                     "Unify divergent function exit nodes", false, false)
@@ -78,10 +78,10 @@
   // TODO: Preserve dominator tree.
   AU.addRequired<PostDominatorTreeWrapperPass>();

-  AU.addRequired<DivergenceAnalysis>();
+  AU.addRequired<LegacyDivergenceAnalysis>();

   // No divergent values are changed, only blocks and branch edges.
-  AU.addPreserved<DivergenceAnalysis>();
+  AU.addPreserved<LegacyDivergenceAnalysis>();

   // We preserve the non-critical-edgeness property
   AU.addPreservedID(BreakCriticalEdgesID);
@@ -95,7 +95,7 @@
 /// \returns true if \p BB is reachable through only uniform branches.
 /// XXX - Is there a more efficient way to find this?
-static bool isUniformlyReached(const DivergenceAnalysis &DA,
+static bool isUniformlyReached(const LegacyDivergenceAnalysis &DA,
                                BasicBlock &BB) {
   SmallVector<BasicBlock *, 8> Stack;
   SmallPtrSet<BasicBlock *, 8> Visited;
@@ -163,7 +163,7 @@
   if (PDT.getRoots().size() <= 1)
     return false;

-  DivergenceAnalysis &DA = getAnalysis<DivergenceAnalysis>();
+  LegacyDivergenceAnalysis &DA = getAnalysis<LegacyDivergenceAnalysis>();

   // Loop over all of the blocks in a function, tracking all of the blocks that
   // return.
Index: llvm/trunk/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -16,7 +16,7 @@
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/IR/BasicBlock.h"
@@ -52,7 +52,7 @@
 using StackVector = SmallVector<StackEntry, 16>;

 class SIAnnotateControlFlow : public FunctionPass {
-  DivergenceAnalysis *DA;
+  LegacyDivergenceAnalysis *DA;

   Type *Boolean;
   Type *Void;
@@ -116,7 +116,7 @@
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<LoopInfoWrapperPass>();
     AU.addRequired<DominatorTreeWrapperPass>();
-    AU.addRequired<DivergenceAnalysis>();
+    AU.addRequired<LegacyDivergenceAnalysis>();
     AU.addPreserved<DominatorTreeWrapperPass>();
     FunctionPass::getAnalysisUsage(AU);
   }
@@ -127,7 +127,7 @@
 INITIALIZE_PASS_BEGIN(SIAnnotateControlFlow, DEBUG_TYPE,
                       "Annotate SI Control Flow", false, false)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
 INITIALIZE_PASS_END(SIAnnotateControlFlow, DEBUG_TYPE,
                     "Annotate SI Control Flow", false, false)
@@ -387,7 +387,7 @@
 bool SIAnnotateControlFlow::runOnFunction(Function &F) {
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
-  DA = &getAnalysis<DivergenceAnalysis>();
+  DA = &getAnalysis<LegacyDivergenceAnalysis>();

   for (df_iterator<BasicBlock *> I = df_begin(&F.getEntryBlock()),
        E = df_end(&F.getEntryBlock()); I != E; ++I) {
Index: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h
@@ -339,7 +339,7 @@
                                unsigned Depth = 0) const override;

   bool isSDNodeSourceOfDivergence(const SDNode *N,
-    FunctionLoweringInfo *FLI, DivergenceAnalysis *DA) const override;
+    FunctionLoweringInfo *FLI, LegacyDivergenceAnalysis *DA) const override;

   bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
                        unsigned MaxDepth = 5) const;
Index: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9166,7 +9166,7 @@
 }

 bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N,
-  FunctionLoweringInfo * FLI, DivergenceAnalysis * DA) const
+  FunctionLoweringInfo * FLI, LegacyDivergenceAnalysis * KDA) const
 {
   switch (N->getOpcode()) {
   case ISD::Register:
@@ -9199,7 +9199,7 @@
       else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv()))
         return true;
     }
-    return !DA || DA->isDivergent(FLI->getValueFromVirtualReg(Reg));
+    return !KDA || KDA->isDivergent(FLI->getValueFromVirtualReg(Reg));
   }
   }
   break;
Index: llvm/trunk/lib/Transforms/Scalar/LoopUnswitch.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ llvm/trunk/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -33,7 +33,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/CodeMetrics.h"
-#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
@@ -215,7 +215,7 @@
     AU.addRequired<AssumptionCacheTracker>();
     AU.addRequired<TargetTransformInfoWrapperPass>();
     if (hasBranchDivergence)
-      AU.addRequired<DivergenceAnalysis>();
+      AU.addRequired<LegacyDivergenceAnalysis>();
     getLoopAnalysisUsage(AU);
   }
@@ -383,7 +383,7 @@
 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_DEPENDENCY(LoopPass)
 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
 INITIALIZE_PASS_END(LoopUnswitch, "loop-unswitch", "Unswitch loops",
                     false, false)
@@ -864,7 +864,7 @@
     return false;
   }
   if (hasBranchDivergence &&
-      getAnalysis<DivergenceAnalysis>().isDivergent(LoopCond)) {
+      getAnalysis<LegacyDivergenceAnalysis>().isDivergent(LoopCond)) {
     LLVM_DEBUG(dbgs() << "NOT unswitching loop %"
                       << currentLoop->getHeader()->getName()
                       << " at non-trivial condition '" << *Val
Index: llvm/trunk/lib/Transforms/Scalar/StructurizeCFG.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ llvm/trunk/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -13,7 +13,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/RegionInfo.h"
 #include "llvm/Analysis/RegionIterator.h"
@@ -183,7 +183,7 @@
   Function *Func;
   Region *ParentRegion;

-  DivergenceAnalysis *DA;
+  LegacyDivergenceAnalysis *DA;
   DominatorTree *DT;
   LoopInfo *LI;
@@ -269,7 +269,7 @@
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     if (SkipUniformRegions)
-      AU.addRequired<DivergenceAnalysis>();
+      AU.addRequired<LegacyDivergenceAnalysis>();
     AU.addRequiredID(LowerSwitchID);
     AU.addRequired<DominatorTreeWrapperPass>();
     AU.addRequired<LoopInfoWrapperPass>();
@@ -285,7 +285,7 @@
 INITIALIZE_PASS_BEGIN(StructurizeCFG, "structurizecfg", "Structurize the CFG",
                       false, false)
-INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
 INITIALIZE_PASS_DEPENDENCY(LowerSwitch)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(RegionInfoPass)
@@ -914,7 +914,7 @@
 }

 static bool hasOnlyUniformBranches(Region *R, unsigned UniformMDKindID,
-                                   const DivergenceAnalysis &DA) {
+                                   const LegacyDivergenceAnalysis &DA) {
   for (auto E : R->elements()) {
     if (!E->isSubRegion()) {
       auto Br = dyn_cast<BranchInst>(E->getEntry()->getTerminator());
@@ -962,7 +962,7 @@
   // but we shouldn't rely on metadata for correctness!
   unsigned UniformMDKindID =
       R->getEntry()->getContext().getMDKindID("structurizecfg.uniform");
-  DA = &getAnalysis<DivergenceAnalysis>();
+  DA = &getAnalysis<LegacyDivergenceAnalysis>();

   if (hasOnlyUniformBranches(R, UniformMDKindID, *DA)) {
     LLVM_DEBUG(dbgs() << "Skipping region with uniform control flow: " << *R
Index: llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/atomics.ll
===================================================================
--- llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/atomics.ll
+++ llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/atomics.ll
@@ -1,45 +0,0 @@
-; RUN: opt -mtriple=amdgcn-- -analyze -divergence %s | FileCheck %s
-
-; CHECK: DIVERGENT: %orig = atomicrmw xchg i32* %ptr, i32 %val seq_cst
-define i32 @test1(i32* %ptr, i32 %val) #0 {
-  %orig = atomicrmw xchg i32* %ptr, i32 %val seq_cst
-  ret i32 %orig
-}
-
-; CHECK: DIVERGENT: %orig = cmpxchg i32* %ptr, i32 %cmp, i32 %new seq_cst seq_cst
-define {i32, i1} @test2(i32* %ptr, i32 %cmp, i32 %new) {
-  %orig = cmpxchg i32* %ptr, i32 %cmp, i32 %new seq_cst seq_cst
-  ret {i32, i1} %orig
-}
-
-; CHECK: DIVERGENT: %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val, i32 0, i32 0, i1 false)
-define i32 @test_atomic_inc_i32(i32 addrspace(1)* %ptr, i32 %val) #0 {
-  %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val, i32 0, i32 0, i1 false)
-  ret i32 %ret
-}
-
-; CHECK: DIVERGENT: %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val, i32 0, i32 0, i1 false)
-define i64 @test_atomic_inc_i64(i64 addrspace(1)* %ptr, i64 %val) #0 {
-  %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val, i32 0, i32 0, i1 false)
-  ret i64 %ret
-}
-
-; CHECK: DIVERGENT: %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val, i32 0, i32 0, i1 false)
-define i32 @test_atomic_dec_i32(i32 addrspace(1)* %ptr, i32 %val) #0 {
-  %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val, i32 0, i32 0, i1 false)
-  ret i32 %ret
-}
-
-; CHECK: DIVERGENT: %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val, i32 0, i32 0, i1 false)
-define i64 @test_atomic_dec_i64(i64 addrspace(1)* %ptr, i64 %val) #0 {
-  %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val, i32 0, i32 0, i1 false)
-  ret i64 %ret
-}
-
-declare i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* nocapture, i32, i32, i32, i1) #1
-declare i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* nocapture, i64, i32, i32, i1) #1
-declare i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* nocapture, i32, i32, i32, i1) #1
-declare i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* nocapture, i64, i32, i32, i1) #1
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind argmemonly }
Index: llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll
===================================================================
--- llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll
+++ llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll
@@ -1,13 +0,0 @@
-; RUN: opt -mtriple=amdgcn-- -analyze -divergence %s | FileCheck %s
-
-; CHECK: DIVERGENT: %swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0
-define amdgpu_kernel void @ds_swizzle(i32 addrspace(1)* %out, i32 %src) #0 {
-  %swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0
-  store i32 %swizzle, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-declare i32 @llvm.amdgcn.ds.swizzle(i32, i32)
#1 - -attributes #0 = { nounwind convergent } -attributes #1 = { nounwind readnone convergent } Index: llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/kernel-args.ll =================================================================== --- llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/kernel-args.ll +++ llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/kernel-args.ll @@ -1,41 +0,0 @@ -; RUN: opt %s -mtriple amdgcn-- -analyze -divergence | FileCheck %s - -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'test_amdgpu_ps': -; CHECK: DIVERGENT: -; CHECK-NOT: %arg0 -; CHECK-NOT: %arg1 -; CHECK-NOT: %arg2 -; CHECK: <2 x i32> %arg3 -; CHECK: DIVERGENT: <3 x i32> %arg4 -; CHECK: DIVERGENT: float %arg5 -; CHECK: DIVERGENT: i32 %arg6 - -define amdgpu_ps void @test_amdgpu_ps([4 x <16 x i8>] addrspace(2)* byval %arg0, float inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <3 x i32> %arg4, float %arg5, i32 %arg6) #0 { - ret void -} - -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'test_amdgpu_kernel': -; CHECK-NOT: %arg0 -; CHECK-NOT: %arg1 -; CHECK-NOT: %arg2 -; CHECK-NOT: %arg3 -; CHECK-NOT: %arg4 -; CHECK-NOT: %arg5 -; CHECK-NOT: %arg6 -define amdgpu_kernel void @test_amdgpu_kernel([4 x <16 x i8>] addrspace(2)* byval %arg0, float inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <3 x i32> %arg4, float %arg5, i32 %arg6) #0 { - ret void -} - -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'test_c': -; CHECK: DIVERGENT: -; CHECK: DIVERGENT: -; CHECK: DIVERGENT: -; CHECK: DIVERGENT: -; CHECK: DIVERGENT: -; CHECK: DIVERGENT: -; CHECK: DIVERGENT: -define void @test_c([4 x <16 x i8>] addrspace(2)* byval %arg0, float inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <3 x i32> %arg4, float %arg5, i32 %arg6) #0 { - ret void -} - -attributes #0 = { nounwind } Index: llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/lit.local.cfg =================================================================== --- llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/lit.local.cfg +++ llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/lit.local.cfg @@ -1,2 +0,0 @@ -if not 'AMDGPU' in config.root.targets: - config.unsupported = True Index: llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll =================================================================== --- llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll +++ llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll @@ -1,103 +0,0 @@ -;RUN: opt -mtriple=amdgcn-mesa-mesa3d -analyze -divergence %s | FileCheck %s - -;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.swap( -define float @buffer_atomic_swap(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { -main_body: - %orig = call i32 @llvm.amdgcn.buffer.atomic.swap(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) - %r = bitcast i32 %orig to float - ret float %r -} - -;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.add( -define float @buffer_atomic_add(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { -main_body: - %orig = call i32 @llvm.amdgcn.buffer.atomic.add(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) - %r = bitcast i32 %orig to float - ret float %r -} - -;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.sub( -define float @buffer_atomic_sub(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { -main_body: - %orig = call i32 @llvm.amdgcn.buffer.atomic.sub(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) - %r = bitcast i32 %orig to float - ret float %r -} - 
-;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.smin( -define float @buffer_atomic_smin(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { -main_body: - %orig = call i32 @llvm.amdgcn.buffer.atomic.smin(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) - %r = bitcast i32 %orig to float - ret float %r -} - -;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.umin( -define float @buffer_atomic_umin(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { -main_body: - %orig = call i32 @llvm.amdgcn.buffer.atomic.umin(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) - %r = bitcast i32 %orig to float - ret float %r -} - -;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.smax( -define float @buffer_atomic_smax(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { -main_body: - %orig = call i32 @llvm.amdgcn.buffer.atomic.smax(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) - %r = bitcast i32 %orig to float - ret float %r -} - -;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.umax( -define float @buffer_atomic_umax(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { -main_body: - %orig = call i32 @llvm.amdgcn.buffer.atomic.umax(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) - %r = bitcast i32 %orig to float - ret float %r -} - -;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.and( -define float @buffer_atomic_and(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { -main_body: - %orig = call i32 @llvm.amdgcn.buffer.atomic.and(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) - %r = bitcast i32 %orig to float - ret float %r -} - -;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.or( -define float @buffer_atomic_or(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { -main_body: - %orig = call i32 @llvm.amdgcn.buffer.atomic.or(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) - %r = bitcast i32 %orig to float - ret float %r -} - -;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.xor( -define float @buffer_atomic_xor(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { -main_body: - %orig = call i32 @llvm.amdgcn.buffer.atomic.xor(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) - %r = bitcast i32 %orig to float - ret float %r -} - -;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.cmpswap( -define float @buffer_atomic_cmpswap(<4 x i32> inreg %rsrc, i32 inreg %data, i32 inreg %cmp) #0 { -main_body: - %orig = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %data, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i1 0) - %r = bitcast i32 %orig to float - ret float %r -} - -declare i32 @llvm.amdgcn.buffer.atomic.swap(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.add(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.smin(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.umin(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.smax(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.umax(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.and(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.or(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.xor(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32, i32, <4 x i32>, i32, i32, i1) #0 - -attributes #0 = { nounwind } Index: llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.image.atomic.ll 
=================================================================== --- llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.image.atomic.ll +++ llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.image.atomic.ll @@ -1,131 +0,0 @@ -;RUN: opt -mtriple=amdgcn-mesa-mesa3d -analyze -divergence %s | FileCheck %s - -;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32( -define float @image_atomic_swap(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { -main_body: - %orig = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) - %r = bitcast i32 %orig to float - ret float %r -} - -;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32( -define float @image_atomic_add(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { -main_body: - %orig = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) - %r = bitcast i32 %orig to float - ret float %r -} - -;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i32( -define float @image_atomic_sub(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { -main_body: - %orig = call i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) - %r = bitcast i32 %orig to float - ret float %r -} - -;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i32( -define float @image_atomic_smin(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { -main_body: - %orig = call i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) - %r = bitcast i32 %orig to float - ret float %r -} - -;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i32( -define float @image_atomic_umin(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { -main_body: - %orig = call i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) - %r = bitcast i32 %orig to float - ret float %r -} - -;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i32( -define float @image_atomic_smax(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { -main_body: - %orig = call i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) - %r = bitcast i32 %orig to float - ret float %r -} - -;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i32( -define float @image_atomic_umax(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { -main_body: - %orig = call i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) - %r = bitcast i32 %orig to float - ret float %r -} - -;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.and.1d.i32.i32( -define float @image_atomic_and(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { -main_body: - %orig = call i32 @llvm.amdgcn.image.atomic.and.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) - %r = bitcast i32 %orig to float - ret float %r -} - -;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.or.1d.i32.i32( -define float @image_atomic_or(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { -main_body: - %orig = call i32 @llvm.amdgcn.image.atomic.or.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) - %r = bitcast i32 %orig to float - ret float %r -} - -;CHECK: DIVERGENT: %orig = call i32 
@llvm.amdgcn.image.atomic.xor.1d.i32.i32( -define float @image_atomic_xor(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { -main_body: - %orig = call i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) - %r = bitcast i32 %orig to float - ret float %r -} - -;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i32( -define float @image_atomic_inc(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { -main_body: - %orig = call i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) - %r = bitcast i32 %orig to float - ret float %r -} - -;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i32( -define float @image_atomic_dec(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { -main_body: - %orig = call i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) - %r = bitcast i32 %orig to float - ret float %r -} - -;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32( -define float @image_atomic_cmpswap(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data, i32 inreg %cmp) #0 { -main_body: - %orig = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32 %data, i32 %cmp, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) - %r = bitcast i32 %orig to float - ret float %r -} - -;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32( -define float @image_atomic_add_2d(<8 x i32> inreg %rsrc, i32 inreg %s, i32 inreg %t, i32 inreg %data) #0 { -main_body: - %orig = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32 %data, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) - %r = bitcast i32 %orig to float - ret float %r -} - -declare i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 -declare i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 -declare i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 -declare i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 -declare i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 -declare i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 -declare i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 -declare i32 @llvm.amdgcn.image.atomic.and.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 -declare i32 @llvm.amdgcn.image.atomic.or.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 -declare i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 -declare i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 -declare i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 -declare i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #0 - -declare i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #0 - -attributes #0 = { nounwind } Index: llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll =================================================================== --- llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll +++ llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll @@ -1,30 +0,0 @@ -; RUN: opt %s -mtriple amdgcn-- -analyze -divergence | FileCheck %s - -; CHECK: DIVERGENT: %tmp5 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %tmp2 -; CHECK: 
DIVERGENT: %tmp10 = load volatile float, float addrspace(1)* %tmp5, align 4 -; CHECK: DIVERGENT: %tmp11 = load volatile float, float addrspace(1)* %tmp5, align 4 - -; The post dominator tree does not have a root node in this case -define amdgpu_kernel void @no_return_blocks(float addrspace(1)* noalias nocapture readonly %arg, float addrspace(1)* noalias nocapture readonly %arg1) #0 { -bb0: - %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0 - %tmp2 = sext i32 %tmp to i64 - %tmp5 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %tmp2 - %tmp6 = load volatile float, float addrspace(1)* %tmp5, align 4 - %tmp8 = fcmp olt float %tmp6, 0.000000e+00 - br i1 %tmp8, label %bb1, label %bb2 - -bb1: - %tmp10 = load volatile float, float addrspace(1)* %tmp5, align 4 - br label %bb2 - -bb2: - %tmp11 = load volatile float, float addrspace(1)* %tmp5, align 4 - br label %bb1 -} - -; Function Attrs: nounwind readnone -declare i32 @llvm.amdgcn.workitem.id.x() #1 - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } Index: llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/phi-undef.ll =================================================================== --- llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/phi-undef.ll +++ llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/phi-undef.ll @@ -1,31 +0,0 @@ -; RUN: opt -mtriple=amdgcn-- -analyze -divergence %s | FileCheck %s - -; CHECK-LABEL: 'test1': -; CHECK-NEXT: DIVERGENT: i32 %bound -; CHECK: {{^ *}}%counter = -; CHECK-NEXT: DIVERGENT: %break = icmp sge i32 %counter, %bound -; CHECK-NEXT: DIVERGENT: br i1 %break, label %footer, label %body -; CHECK: {{^ *}}%counter.next = -; CHECK: {{^ *}}%counter.footer = -; CHECK: DIVERGENT: br i1 %break, label %end, label %header -; Note: %counter is not divergent! 
-define amdgpu_ps void @test1(i32 %bound) { -entry: - br label %header - -header: - %counter = phi i32 [ 0, %entry ], [ %counter.footer, %footer ] - %break = icmp sge i32 %counter, %bound - br i1 %break, label %footer, label %body - -body: - %counter.next = add i32 %counter, 1 - br label %footer - -footer: - %counter.footer = phi i32 [ %counter.next, %body ], [ undef, %header ] - br i1 %break, label %end, label %header - -end: - ret void -} Index: llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/unreachable-loop-block.ll =================================================================== --- llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/unreachable-loop-block.ll +++ llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/unreachable-loop-block.ll @@ -1,17 +0,0 @@ -; RUN: opt %s -mtriple amdgcn-- -analyze -divergence | FileCheck %s - -; CHECK: DIVERGENT: %tmp = cmpxchg volatile -define amdgpu_kernel void @unreachable_loop(i32 %tidx) #0 { -entry: - unreachable - -unreachable_loop: ; preds = %do.body.i, %if.then11 - %tmp = cmpxchg volatile i32 addrspace(1)* null, i32 0, i32 0 seq_cst seq_cst - %cmp.i = extractvalue { i32, i1 } %tmp, 1 - br i1 %cmp.i, label %unreachable_loop, label %end - -end: ; preds = %do.body.i51, %atomicAdd_g_f.exit - unreachable -} - -attributes #0 = { norecurse nounwind } Index: llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll =================================================================== --- llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll +++ llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll @@ -1,45 +0,0 @@ -; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence %s | FileCheck %s - -declare i32 @llvm.amdgcn.workitem.id.x() #0 -declare i32 @llvm.amdgcn.workitem.id.y() #0 -declare i32 @llvm.amdgcn.workitem.id.z() #0 -declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0 -declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0 - -; CHECK: DIVERGENT: %id.x = call i32 @llvm.amdgcn.workitem.id.x() -define amdgpu_kernel void @workitem_id_x() #1 { - %id.x = call i32 @llvm.amdgcn.workitem.id.x() - store volatile i32 %id.x, i32 addrspace(1)* undef - ret void -} - -; CHECK: DIVERGENT: %id.y = call i32 @llvm.amdgcn.workitem.id.y() -define amdgpu_kernel void @workitem_id_y() #1 { - %id.y = call i32 @llvm.amdgcn.workitem.id.y() - store volatile i32 %id.y, i32 addrspace(1)* undef - ret void -} - -; CHECK: DIVERGENT: %id.z = call i32 @llvm.amdgcn.workitem.id.z() -define amdgpu_kernel void @workitem_id_z() #1 { - %id.z = call i32 @llvm.amdgcn.workitem.id.z() - store volatile i32 %id.z, i32 addrspace(1)* undef - ret void -} - -; CHECK: DIVERGENT: %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 0, i32 0) -define amdgpu_kernel void @mbcnt_lo() #1 { - %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 0, i32 0) - store volatile i32 %mbcnt.lo, i32 addrspace(1)* undef - ret void -} - -; CHECK: DIVERGENT: %mbcnt.hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 0, i32 0) -define amdgpu_kernel void @mbcnt_hi() #1 { - %mbcnt.hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 0, i32 0) - store volatile i32 %mbcnt.hi, i32 addrspace(1)* undef - ret void -} - -attributes #0 = { nounwind readnone } -attributes #1 = { nounwind } Index: llvm/trunk/test/Analysis/DivergenceAnalysis/NVPTX/diverge.ll =================================================================== --- llvm/trunk/test/Analysis/DivergenceAnalysis/NVPTX/diverge.ll +++ llvm/trunk/test/Analysis/DivergenceAnalysis/NVPTX/diverge.ll @@ -1,219 +0,0 @@ -; RUN: opt %s -analyze -divergence | 
FileCheck %s - -target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" -target triple = "nvptx64-nvidia-cuda" - -; return (n < 0 ? a + threadIdx.x : b + threadIdx.x) -define i32 @no_diverge(i32 %n, i32 %a, i32 %b) { -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'no_diverge' -entry: - %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - %cond = icmp slt i32 %n, 0 - br i1 %cond, label %then, label %else ; uniform -; CHECK-NOT: DIVERGENT: br i1 %cond, -then: - %a1 = add i32 %a, %tid - br label %merge -else: - %b2 = add i32 %b, %tid - br label %merge -merge: - %c = phi i32 [ %a1, %then ], [ %b2, %else ] - ret i32 %c -} - -; c = a; -; if (threadIdx.x < 5) // divergent: data dependent -; c = b; -; return c; // c is divergent: sync dependent -define i32 @sync(i32 %a, i32 %b) { -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'sync' -bb1: - %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.y() - %cond = icmp slt i32 %tid, 5 - br i1 %cond, label %bb2, label %bb3 -; CHECK: DIVERGENT: br i1 %cond, -bb2: - br label %bb3 -bb3: - %c = phi i32 [ %a, %bb1 ], [ %b, %bb2 ] ; sync dependent on tid -; CHECK: DIVERGENT: %c = - ret i32 %c -} - -; c = 0; -; if (threadIdx.x >= 5) { // divergent -; c = (n < 0 ? a : b); // c here is uniform because n is uniform -; } -; // c here is divergent because it is sync dependent on threadIdx.x >= 5 -; return c; -define i32 @mixed(i32 %n, i32 %a, i32 %b) { -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'mixed' -bb1: - %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.z() - %cond = icmp slt i32 %tid, 5 - br i1 %cond, label %bb6, label %bb2 -; CHECK: DIVERGENT: br i1 %cond, -bb2: - %cond2 = icmp slt i32 %n, 0 - br i1 %cond2, label %bb4, label %bb3 -bb3: - br label %bb5 -bb4: - br label %bb5 -bb5: - %c = phi i32 [ %a, %bb3 ], [ %b, %bb4 ] -; CHECK-NOT: DIVERGENT: %c = - br label %bb6 -bb6: - %c2 = phi i32 [ 0, %bb1], [ %c, %bb5 ] -; CHECK: DIVERGENT: %c2 = - ret i32 %c2 -} - -; We conservatively treats all parameters of a __device__ function as divergent. -define i32 @device(i32 %n, i32 %a, i32 %b) { -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'device' -; CHECK: DIVERGENT: i32 %n -; CHECK: DIVERGENT: i32 %a -; CHECK: DIVERGENT: i32 %b -entry: - %cond = icmp slt i32 %n, 0 - br i1 %cond, label %then, label %else -; CHECK: DIVERGENT: br i1 %cond, -then: - br label %merge -else: - br label %merge -merge: - %c = phi i32 [ %a, %then ], [ %b, %else ] - ret i32 %c -} - -; int i = 0; -; do { -; i++; // i here is uniform -; } while (i < laneid); -; return i == 10 ? 0 : 1; // i here is divergent -; -; The i defined in the loop is used outside. -define i32 @loop() { -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'loop' -entry: - %laneid = call i32 @llvm.nvvm.read.ptx.sreg.laneid() - br label %loop -loop: - %i = phi i32 [ 0, %entry ], [ %i1, %loop ] -; CHECK-NOT: DIVERGENT: %i = - %i1 = add i32 %i, 1 - %exit_cond = icmp sge i32 %i1, %laneid - br i1 %exit_cond, label %loop_exit, label %loop -loop_exit: - %cond = icmp eq i32 %i, 10 - br i1 %cond, label %then, label %else -; CHECK: DIVERGENT: br i1 %cond, -then: - ret i32 0 -else: - ret i32 1 -} - -; Same as @loop, but the loop is in the LCSSA form. 
-define i32 @lcssa() { -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'lcssa' -entry: - %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - br label %loop -loop: - %i = phi i32 [ 0, %entry ], [ %i1, %loop ] -; CHECK-NOT: DIVERGENT: %i = - %i1 = add i32 %i, 1 - %exit_cond = icmp sge i32 %i1, %tid - br i1 %exit_cond, label %loop_exit, label %loop -loop_exit: - %i.lcssa = phi i32 [ %i, %loop ] -; CHECK: DIVERGENT: %i.lcssa = - %cond = icmp eq i32 %i.lcssa, 10 - br i1 %cond, label %then, label %else -; CHECK: DIVERGENT: br i1 %cond, -then: - ret i32 0 -else: - ret i32 1 -} - -; This test contains an unstructured loop. -; +-------------- entry ----------------+ -; | | -; V V -; i1 = phi(0, i3) i2 = phi(0, i3) -; j1 = i1 + 1 ---> i3 = phi(j1, j2) <--- j2 = i2 + 2 -; ^ | ^ -; | V | -; +-------- switch (tid / i3) ----------+ -; | -; V -; if (i3 == 5) // divergent -; because sync dependent on (tid / i3). -define i32 @unstructured_loop(i1 %entry_cond) { -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'unstructured_loop' -entry: - %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - br i1 %entry_cond, label %loop_entry_1, label %loop_entry_2 -loop_entry_1: - %i1 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ] - %j1 = add i32 %i1, 1 - br label %loop_body -loop_entry_2: - %i2 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ] - %j2 = add i32 %i2, 2 - br label %loop_body -loop_body: - %i3 = phi i32 [ %j1, %loop_entry_1 ], [ %j2, %loop_entry_2 ] - br label %loop_latch -loop_latch: - %div = sdiv i32 %tid, %i3 - switch i32 %div, label %branch [ i32 1, label %loop_entry_1 - i32 2, label %loop_entry_2 ] -branch: - %cmp = icmp eq i32 %i3, 5 - br i1 %cmp, label %then, label %else -; CHECK: DIVERGENT: br i1 %cmp, -then: - ret i32 0 -else: - ret i32 1 -} - -; Verifies sync-dependence is computed correctly in the absense of loops. 
-define i32 @sync_no_loop(i32 %arg) { -entry: - %0 = add i32 %arg, 1 - %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - %1 = icmp sge i32 %tid, 10 - br i1 %1, label %bb1, label %bb2 - -bb1: - br label %bb3 - -bb2: - br label %bb3 - -bb3: - %2 = add i32 %0, 2 - ; CHECK-NOT: DIVERGENT: %2 - ret i32 %2 -} - -declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() -declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() -declare i32 @llvm.nvvm.read.ptx.sreg.tid.z() -declare i32 @llvm.nvvm.read.ptx.sreg.laneid() - -!nvvm.annotations = !{!0, !1, !2, !3, !4, !5} -!0 = !{i32 (i32, i32, i32)* @no_diverge, !"kernel", i32 1} -!1 = !{i32 (i32, i32)* @sync, !"kernel", i32 1} -!2 = !{i32 (i32, i32, i32)* @mixed, !"kernel", i32 1} -!3 = !{i32 ()* @loop, !"kernel", i32 1} -!4 = !{i32 (i1)* @unstructured_loop, !"kernel", i32 1} -!5 = !{i32 (i32)* @sync_no_loop, !"kernel", i32 1} Index: llvm/trunk/test/Analysis/DivergenceAnalysis/NVPTX/lit.local.cfg =================================================================== --- llvm/trunk/test/Analysis/DivergenceAnalysis/NVPTX/lit.local.cfg +++ llvm/trunk/test/Analysis/DivergenceAnalysis/NVPTX/lit.local.cfg @@ -1,2 +0,0 @@ -if not 'NVPTX' in config.root.targets: - config.unsupported = True Index: llvm/trunk/test/Analysis/LegacyDivergenceAnalysis/AMDGPU/atomics.ll =================================================================== --- llvm/trunk/test/Analysis/LegacyDivergenceAnalysis/AMDGPU/atomics.ll +++ llvm/trunk/test/Analysis/LegacyDivergenceAnalysis/AMDGPU/atomics.ll @@ -0,0 +1,45 @@ +; RUN: opt -mtriple=amdgcn-- -analyze -divergence %s | FileCheck %s + +; CHECK: DIVERGENT: %orig = atomicrmw xchg i32* %ptr, i32 %val seq_cst +define i32 @test1(i32* %ptr, i32 %val) #0 { + %orig = atomicrmw xchg i32* %ptr, i32 %val seq_cst + ret i32 %orig +} + +; CHECK: DIVERGENT: %orig = cmpxchg i32* %ptr, i32 %cmp, i32 %new seq_cst seq_cst +define {i32, i1} @test2(i32* %ptr, i32 %cmp, i32 %new) { + %orig = cmpxchg i32* %ptr, i32 %cmp, i32 %new seq_cst seq_cst + ret {i32, i1} %orig +} + +; CHECK: DIVERGENT: %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val, i32 0, i32 0, i1 false) +define i32 @test_atomic_inc_i32(i32 addrspace(1)* %ptr, i32 %val) #0 { + %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val, i32 0, i32 0, i1 false) + ret i32 %ret +} + +; CHECK: DIVERGENT: %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val, i32 0, i32 0, i1 false) +define i64 @test_atomic_inc_i64(i64 addrspace(1)* %ptr, i64 %val) #0 { + %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val, i32 0, i32 0, i1 false) + ret i64 %ret +} + +; CHECK: DIVERGENT: %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val, i32 0, i32 0, i1 false) +define i32 @test_atomic_dec_i32(i32 addrspace(1)* %ptr, i32 %val) #0 { + %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val, i32 0, i32 0, i1 false) + ret i32 %ret +} + +; CHECK: DIVERGENT: %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val, i32 0, i32 0, i1 false) +define i64 @test_atomic_dec_i64(i64 addrspace(1)* %ptr, i64 %val) #0 { + %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val, i32 0, i32 0, i1 false) + ret i64 %ret +} + +declare i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* nocapture, i32, i32, i32, i1) #1 +declare i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* nocapture, i64, i32, i32, i1) #1 +declare i32 
@llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* nocapture, i32, i32, i32, i1) #1 +declare i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* nocapture, i64, i32, i32, i1) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind argmemonly } Index: llvm/trunk/test/Analysis/LegacyDivergenceAnalysis/AMDGPU/intrinsics.ll =================================================================== --- llvm/trunk/test/Analysis/LegacyDivergenceAnalysis/AMDGPU/intrinsics.ll +++ llvm/trunk/test/Analysis/LegacyDivergenceAnalysis/AMDGPU/intrinsics.ll @@ -0,0 +1,13 @@ +; RUN: opt -mtriple=amdgcn-- -analyze -divergence %s | FileCheck %s + +; CHECK: DIVERGENT: %swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0 +define amdgpu_kernel void @ds_swizzle(i32 addrspace(1)* %out, i32 %src) #0 { + %swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0 + store i32 %swizzle, i32 addrspace(1)* %out, align 4 + ret void +} + +declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1 + +attributes #0 = { nounwind convergent } +attributes #1 = { nounwind readnone convergent } Index: llvm/trunk/test/Analysis/LegacyDivergenceAnalysis/AMDGPU/kernel-args.ll =================================================================== --- llvm/trunk/test/Analysis/LegacyDivergenceAnalysis/AMDGPU/kernel-args.ll +++ llvm/trunk/test/Analysis/LegacyDivergenceAnalysis/AMDGPU/kernel-args.ll @@ -0,0 +1,41 @@ +; RUN: opt %s -mtriple amdgcn-- -analyze -divergence | FileCheck %s + +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'test_amdgpu_ps': +; CHECK: DIVERGENT: +; CHECK-NOT: %arg0 +; CHECK-NOT: %arg1 +; CHECK-NOT: %arg2 +; CHECK: <2 x i32> %arg3 +; CHECK: DIVERGENT: <3 x i32> %arg4 +; CHECK: DIVERGENT: float %arg5 +; CHECK: DIVERGENT: i32 %arg6 + +define amdgpu_ps void @test_amdgpu_ps([4 x <16 x i8>] addrspace(2)* byval %arg0, float inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <3 x i32> %arg4, float %arg5, i32 %arg6) #0 { + ret void +} + +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'test_amdgpu_kernel': +; CHECK-NOT: %arg0 +; CHECK-NOT: %arg1 +; CHECK-NOT: %arg2 +; CHECK-NOT: %arg3 +; CHECK-NOT: %arg4 +; CHECK-NOT: %arg5 +; CHECK-NOT: %arg6 +define amdgpu_kernel void @test_amdgpu_kernel([4 x <16 x i8>] addrspace(2)* byval %arg0, float inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <3 x i32> %arg4, float %arg5, i32 %arg6) #0 { + ret void +} + +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'test_c': +; CHECK: DIVERGENT: +; CHECK: DIVERGENT: +; CHECK: DIVERGENT: +; CHECK: DIVERGENT: +; CHECK: DIVERGENT: +; CHECK: DIVERGENT: +; CHECK: DIVERGENT: +define void @test_c([4 x <16 x i8>] addrspace(2)* byval %arg0, float inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <3 x i32> %arg4, float %arg5, i32 %arg6) #0 { + ret void +} + +attributes #0 = { nounwind } Index: llvm/trunk/test/Analysis/LegacyDivergenceAnalysis/AMDGPU/lit.local.cfg =================================================================== --- llvm/trunk/test/Analysis/LegacyDivergenceAnalysis/AMDGPU/lit.local.cfg +++ llvm/trunk/test/Analysis/LegacyDivergenceAnalysis/AMDGPU/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'AMDGPU' in config.root.targets: + config.unsupported = True Index: llvm/trunk/test/Analysis/LegacyDivergenceAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll =================================================================== --- llvm/trunk/test/Analysis/LegacyDivergenceAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll +++ 
llvm/trunk/test/Analysis/LegacyDivergenceAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll @@ -0,0 +1,103 @@ +;RUN: opt -mtriple=amdgcn-mesa-mesa3d -analyze -divergence %s | FileCheck %s + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.swap( +define float @buffer_atomic_swap(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.swap(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.add( +define float @buffer_atomic_add(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.add(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.sub( +define float @buffer_atomic_sub(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.sub(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.smin( +define float @buffer_atomic_smin(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.smin(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.umin( +define float @buffer_atomic_umin(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.umin(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.smax( +define float @buffer_atomic_smax(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.smax(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.umax( +define float @buffer_atomic_umax(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.umax(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.and( +define float @buffer_atomic_and(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.and(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.or( +define float @buffer_atomic_or(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.or(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.xor( +define float @buffer_atomic_xor(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.xor(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.cmpswap( +define float @buffer_atomic_cmpswap(<4 x i32> inreg %rsrc, i32 inreg %data, i32 inreg %cmp) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %data, 
i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +declare i32 @llvm.amdgcn.buffer.atomic.swap(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.add(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.smin(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.umin(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.smax(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.umax(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.and(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.or(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.xor(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32, i32, <4 x i32>, i32, i32, i1) #0 + +attributes #0 = { nounwind } Index: llvm/trunk/test/Analysis/LegacyDivergenceAnalysis/AMDGPU/llvm.amdgcn.image.atomic.ll =================================================================== --- llvm/trunk/test/Analysis/LegacyDivergenceAnalysis/AMDGPU/llvm.amdgcn.image.atomic.ll +++ llvm/trunk/test/Analysis/LegacyDivergenceAnalysis/AMDGPU/llvm.amdgcn.image.atomic.ll @@ -0,0 +1,131 @@ +;RUN: opt -mtriple=amdgcn-mesa-mesa3d -analyze -divergence %s | FileCheck %s + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32( +define float @image_atomic_swap(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32( +define float @image_atomic_add(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i32( +define float @image_atomic_sub(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i32( +define float @image_atomic_smin(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i32( +define float @image_atomic_umin(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i32( +define float @image_atomic_smax(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + 
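+; As with the buffer atomics above, each image atomic returns the per-lane
+; value previously held in memory, so its result is divergent even though
+; every operand is inreg (uniform).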
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i32( +define float @image_atomic_umax(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.and.1d.i32.i32( +define float @image_atomic_and(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.and.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.or.1d.i32.i32( +define float @image_atomic_or(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.or.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i32( +define float @image_atomic_xor(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i32( +define float @image_atomic_inc(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i32( +define float @image_atomic_dec(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32( +define float @image_atomic_cmpswap(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data, i32 inreg %cmp) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32 %data, i32 %cmp, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32( +define float @image_atomic_add_2d(<8 x i32> inreg %rsrc, i32 inreg %s, i32 inreg %t, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32 %data, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +declare i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.and.1d.i32.i32(i32, 
i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.or.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #0 + +declare i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #0 + +attributes #0 = { nounwind } Index: llvm/trunk/test/Analysis/LegacyDivergenceAnalysis/AMDGPU/no-return-blocks.ll =================================================================== --- llvm/trunk/test/Analysis/LegacyDivergenceAnalysis/AMDGPU/no-return-blocks.ll +++ llvm/trunk/test/Analysis/LegacyDivergenceAnalysis/AMDGPU/no-return-blocks.ll @@ -0,0 +1,30 @@ +; RUN: opt %s -mtriple amdgcn-- -analyze -divergence | FileCheck %s + +; CHECK: DIVERGENT: %tmp5 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %tmp2 +; CHECK: DIVERGENT: %tmp10 = load volatile float, float addrspace(1)* %tmp5, align 4 +; CHECK: DIVERGENT: %tmp11 = load volatile float, float addrspace(1)* %tmp5, align 4 + +; The post dominator tree does not have a root node in this case +define amdgpu_kernel void @no_return_blocks(float addrspace(1)* noalias nocapture readonly %arg, float addrspace(1)* noalias nocapture readonly %arg1) #0 { +bb0: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0 + %tmp2 = sext i32 %tmp to i64 + %tmp5 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %tmp2 + %tmp6 = load volatile float, float addrspace(1)* %tmp5, align 4 + %tmp8 = fcmp olt float %tmp6, 0.000000e+00 + br i1 %tmp8, label %bb1, label %bb2 + +bb1: + %tmp10 = load volatile float, float addrspace(1)* %tmp5, align 4 + br label %bb2 + +bb2: + %tmp11 = load volatile float, float addrspace(1)* %tmp5, align 4 + br label %bb1 +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } Index: llvm/trunk/test/Analysis/LegacyDivergenceAnalysis/AMDGPU/phi-undef.ll =================================================================== --- llvm/trunk/test/Analysis/LegacyDivergenceAnalysis/AMDGPU/phi-undef.ll +++ llvm/trunk/test/Analysis/LegacyDivergenceAnalysis/AMDGPU/phi-undef.ll @@ -0,0 +1,31 @@ +; RUN: opt -mtriple=amdgcn-- -analyze -divergence %s | FileCheck %s + +; CHECK-LABEL: 'test1': +; CHECK-NEXT: DIVERGENT: i32 %bound +; CHECK: {{^ *}}%counter = +; CHECK-NEXT: DIVERGENT: %break = icmp sge i32 %counter, %bound +; CHECK-NEXT: DIVERGENT: br i1 %break, label %footer, label %body +; CHECK: {{^ *}}%counter.next = +; CHECK: {{^ *}}%counter.footer = +; CHECK: DIVERGENT: br i1 %break, label %end, label %header +; Note: %counter is not divergent! 
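+; %counter.footer is uniform as well: its undef incoming value from %header
+; can be treated as equal to %counter.next, so the divergent branch on %break
+; adds no sync dependence to this phi; only %break and the branches on it are
+; divergent.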
+define amdgpu_ps void @test1(i32 %bound) { +entry: + br label %header + +header: + %counter = phi i32 [ 0, %entry ], [ %counter.footer, %footer ] + %break = icmp sge i32 %counter, %bound + br i1 %break, label %footer, label %body + +body: + %counter.next = add i32 %counter, 1 + br label %footer + +footer: + %counter.footer = phi i32 [ %counter.next, %body ], [ undef, %header ] + br i1 %break, label %end, label %header + +end: + ret void +} Index: llvm/trunk/test/Analysis/LegacyDivergenceAnalysis/AMDGPU/unreachable-loop-block.ll =================================================================== --- llvm/trunk/test/Analysis/LegacyDivergenceAnalysis/AMDGPU/unreachable-loop-block.ll +++ llvm/trunk/test/Analysis/LegacyDivergenceAnalysis/AMDGPU/unreachable-loop-block.ll @@ -0,0 +1,17 @@ +; RUN: opt %s -mtriple amdgcn-- -analyze -divergence | FileCheck %s + +; CHECK: DIVERGENT: %tmp = cmpxchg volatile +define amdgpu_kernel void @unreachable_loop(i32 %tidx) #0 { +entry: + unreachable + +unreachable_loop: ; preds = %do.body.i, %if.then11 + %tmp = cmpxchg volatile i32 addrspace(1)* null, i32 0, i32 0 seq_cst seq_cst + %cmp.i = extractvalue { i32, i1 } %tmp, 1 + br i1 %cmp.i, label %unreachable_loop, label %end + +end: ; preds = %do.body.i51, %atomicAdd_g_f.exit + unreachable +} + +attributes #0 = { norecurse nounwind } Index: llvm/trunk/test/Analysis/LegacyDivergenceAnalysis/AMDGPU/workitem-intrinsics.ll =================================================================== --- llvm/trunk/test/Analysis/LegacyDivergenceAnalysis/AMDGPU/workitem-intrinsics.ll +++ llvm/trunk/test/Analysis/LegacyDivergenceAnalysis/AMDGPU/workitem-intrinsics.ll @@ -0,0 +1,45 @@ +; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence %s | FileCheck %s + +declare i32 @llvm.amdgcn.workitem.id.x() #0 +declare i32 @llvm.amdgcn.workitem.id.y() #0 +declare i32 @llvm.amdgcn.workitem.id.z() #0 +declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0 +declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0 + +; CHECK: DIVERGENT: %id.x = call i32 @llvm.amdgcn.workitem.id.x() +define amdgpu_kernel void @workitem_id_x() #1 { + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + store volatile i32 %id.x, i32 addrspace(1)* undef + ret void +} + +; CHECK: DIVERGENT: %id.y = call i32 @llvm.amdgcn.workitem.id.y() +define amdgpu_kernel void @workitem_id_y() #1 { + %id.y = call i32 @llvm.amdgcn.workitem.id.y() + store volatile i32 %id.y, i32 addrspace(1)* undef + ret void +} + +; CHECK: DIVERGENT: %id.z = call i32 @llvm.amdgcn.workitem.id.z() +define amdgpu_kernel void @workitem_id_z() #1 { + %id.z = call i32 @llvm.amdgcn.workitem.id.z() + store volatile i32 %id.z, i32 addrspace(1)* undef + ret void +} + +; CHECK: DIVERGENT: %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 0, i32 0) +define amdgpu_kernel void @mbcnt_lo() #1 { + %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 0, i32 0) + store volatile i32 %mbcnt.lo, i32 addrspace(1)* undef + ret void +} + +; CHECK: DIVERGENT: %mbcnt.hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 0, i32 0) +define amdgpu_kernel void @mbcnt_hi() #1 { + %mbcnt.hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 0, i32 0) + store volatile i32 %mbcnt.hi, i32 addrspace(1)* undef + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } Index: llvm/trunk/test/Analysis/LegacyDivergenceAnalysis/NVPTX/diverge.ll =================================================================== --- llvm/trunk/test/Analysis/LegacyDivergenceAnalysis/NVPTX/diverge.ll +++ llvm/trunk/test/Analysis/LegacyDivergenceAnalysis/NVPTX/diverge.ll @@ -0,0 
+1,219 @@
+; RUN: opt %s -analyze -divergence | FileCheck %s
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; return (n < 0 ? a + threadIdx.x : b + threadIdx.x)
+define i32 @no_diverge(i32 %n, i32 %a, i32 %b) {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'no_diverge'
+entry:
+  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  %cond = icmp slt i32 %n, 0
+  br i1 %cond, label %then, label %else ; uniform
+; CHECK-NOT: DIVERGENT: br i1 %cond,
+then:
+  %a1 = add i32 %a, %tid
+  br label %merge
+else:
+  %b2 = add i32 %b, %tid
+  br label %merge
+merge:
+  %c = phi i32 [ %a1, %then ], [ %b2, %else ]
+  ret i32 %c
+}
+
+; c = a;
+; if (threadIdx.x < 5)    // divergent: data dependent
+;   c = b;
+; return c;               // c is divergent: sync dependent
+define i32 @sync(i32 %a, i32 %b) {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'sync'
+bb1:
+  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
+  %cond = icmp slt i32 %tid, 5
+  br i1 %cond, label %bb2, label %bb3
+; CHECK: DIVERGENT: br i1 %cond,
+bb2:
+  br label %bb3
+bb3:
+  %c = phi i32 [ %a, %bb1 ], [ %b, %bb2 ] ; sync dependent on tid
+; CHECK: DIVERGENT: %c =
+  ret i32 %c
+}
+
+; c = 0;
+; if (threadIdx.x >= 5) {  // divergent
+;   c = (n < 0 ? a : b);   // c here is uniform because n is uniform
+; }
+; // c here is divergent because it is sync dependent on threadIdx.x >= 5
+; return c;
+define i32 @mixed(i32 %n, i32 %a, i32 %b) {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'mixed'
+bb1:
+  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
+  %cond = icmp slt i32 %tid, 5
+  br i1 %cond, label %bb6, label %bb2
+; CHECK: DIVERGENT: br i1 %cond,
+bb2:
+  %cond2 = icmp slt i32 %n, 0
+  br i1 %cond2, label %bb4, label %bb3
+bb3:
+  br label %bb5
+bb4:
+  br label %bb5
+bb5:
+  %c = phi i32 [ %a, %bb3 ], [ %b, %bb4 ]
+; CHECK-NOT: DIVERGENT: %c =
+  br label %bb6
+bb6:
+  %c2 = phi i32 [ 0, %bb1], [ %c, %bb5 ]
+; CHECK: DIVERGENT: %c2 =
+  ret i32 %c2
+}
+
+; We conservatively treat all parameters of a __device__ function as divergent.
+define i32 @device(i32 %n, i32 %a, i32 %b) {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'device'
+; CHECK: DIVERGENT: i32 %n
+; CHECK: DIVERGENT: i32 %a
+; CHECK: DIVERGENT: i32 %b
+entry:
+  %cond = icmp slt i32 %n, 0
+  br i1 %cond, label %then, label %else
+; CHECK: DIVERGENT: br i1 %cond,
+then:
+  br label %merge
+else:
+  br label %merge
+merge:
+  %c = phi i32 [ %a, %then ], [ %b, %else ]
+  ret i32 %c
+}
+
+; int i = 0;
+; do {
+;   i++;                  // i here is uniform
+; } while (i < laneid);
+; return i == 10 ? 0 : 1; // i here is divergent
+;
+; The i defined in the loop is used outside.
+define i32 @loop() {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'loop'
+entry:
+  %laneid = call i32 @llvm.nvvm.read.ptx.sreg.laneid()
+  br label %loop
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i1, %loop ]
+; CHECK-NOT: DIVERGENT: %i =
+  %i1 = add i32 %i, 1
+  %exit_cond = icmp sge i32 %i1, %laneid
+  br i1 %exit_cond, label %loop_exit, label %loop
+loop_exit:
+  %cond = icmp eq i32 %i, 10
+  br i1 %cond, label %then, label %else
+; CHECK: DIVERGENT: br i1 %cond,
+then:
+  ret i32 0
+else:
+  ret i32 1
+}
+
+; Same as @loop, but the loop is in the LCSSA form.
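+; The loop-carried %i stays uniform, but its value escapes through the
+; single-incoming LCSSA phi %i.lcssa, which is divergent: the loop's exit
+; condition depends on the divergent %tid.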
+define i32 @lcssa() {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'lcssa'
+entry:
+  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  br label %loop
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i1, %loop ]
+; CHECK-NOT: DIVERGENT: %i =
+  %i1 = add i32 %i, 1
+  %exit_cond = icmp sge i32 %i1, %tid
+  br i1 %exit_cond, label %loop_exit, label %loop
+loop_exit:
+  %i.lcssa = phi i32 [ %i, %loop ]
+; CHECK: DIVERGENT: %i.lcssa =
+  %cond = icmp eq i32 %i.lcssa, 10
+  br i1 %cond, label %then, label %else
+; CHECK: DIVERGENT: br i1 %cond,
+then:
+  ret i32 0
+else:
+  ret i32 1
+}
+
+; This test contains an unstructured loop.
+;           +-------------- entry ----------------+
+;           |                                      |
+;           V                                      V
+; i1 = phi(0, i3)                        i2 = phi(0, i3)
+;     j1 = i1 + 1 ---> i3 = phi(j1, j2) <--- j2 = i2 + 2
+;           ^                  |                  ^
+;           |                  V                  |
+;           +-------- switch (tid / i3) ----------+
+;                              |
+;                              V
+;                       if (i3 == 5) // divergent
+; because sync dependent on (tid / i3).
+define i32 @unstructured_loop(i1 %entry_cond) {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'unstructured_loop'
+entry:
+  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  br i1 %entry_cond, label %loop_entry_1, label %loop_entry_2
+loop_entry_1:
+  %i1 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ]
+  %j1 = add i32 %i1, 1
+  br label %loop_body
+loop_entry_2:
+  %i2 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ]
+  %j2 = add i32 %i2, 2
+  br label %loop_body
+loop_body:
+  %i3 = phi i32 [ %j1, %loop_entry_1 ], [ %j2, %loop_entry_2 ]
+  br label %loop_latch
+loop_latch:
+  %div = sdiv i32 %tid, %i3
+  switch i32 %div, label %branch [ i32 1, label %loop_entry_1
+                                   i32 2, label %loop_entry_2 ]
+branch:
+  %cmp = icmp eq i32 %i3, 5
+  br i1 %cmp, label %then, label %else
+; CHECK: DIVERGENT: br i1 %cmp,
+then:
+  ret i32 0
+else:
+  ret i32 1
+}
+
+; Verifies sync-dependence is computed correctly in the absence of loops.
+define i32 @sync_no_loop(i32 %arg) {
+entry:
+  %0 = add i32 %arg, 1
+  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  %1 = icmp sge i32 %tid, 10
+  br i1 %1, label %bb1, label %bb2
+
+bb1:
+  br label %bb3
+
+bb2:
+  br label %bb3
+
+bb3:
+  %2 = add i32 %0, 2
+  ; CHECK-NOT: DIVERGENT: %2
+  ret i32 %2
+}
+
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.y()
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.z()
+declare i32 @llvm.nvvm.read.ptx.sreg.laneid()
+
+!nvvm.annotations = !{!0, !1, !2, !3, !4, !5}
+!0 = !{i32 (i32, i32, i32)* @no_diverge, !"kernel", i32 1}
+!1 = !{i32 (i32, i32)* @sync, !"kernel", i32 1}
+!2 = !{i32 (i32, i32, i32)* @mixed, !"kernel", i32 1}
+!3 = !{i32 ()* @loop, !"kernel", i32 1}
+!4 = !{i32 (i1)* @unstructured_loop, !"kernel", i32 1}
+!5 = !{i32 (i32)* @sync_no_loop, !"kernel", i32 1}
Index: llvm/trunk/test/Analysis/LegacyDivergenceAnalysis/NVPTX/lit.local.cfg
===================================================================
--- llvm/trunk/test/Analysis/LegacyDivergenceAnalysis/NVPTX/lit.local.cfg
+++ llvm/trunk/test/Analysis/LegacyDivergenceAnalysis/NVPTX/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'NVPTX' in config.root.targets:
+    config.unsupported = True
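A minimal sketch of reproducing one of these checks by hand, assuming the opt
and FileCheck binaries from this build are on PATH (this simply substitutes the
test's path for lit's %s in its RUN line):

  opt test/Analysis/LegacyDivergenceAnalysis/NVPTX/diverge.ll -analyze -divergence \
    | FileCheck test/Analysis/LegacyDivergenceAnalysis/NVPTX/diverge.ll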