Index: include/llvm/Analysis/DivergenceAnalysis.h =================================================================== --- include/llvm/Analysis/DivergenceAnalysis.h +++ include/llvm/Analysis/DivergenceAnalysis.h @@ -173,6 +173,75 @@ std::vector Worklist; }; +/// \brief Divergence analysis frontend for GPU kernels. +class GPUDivergenceAnalysis { + SyncDependenceAnalysis SDA; + DivergenceAnalysis DA; + +public: + /// Runs the divergence analysis on @F, a GPU kernel + GPUDivergenceAnalysis(Function &F, const DominatorTree &DT, + const PostDominatorTree &PDT, const LoopInfo &LI, + const TargetTransformInfo &TTI); + + /// Whether any divergence was detected. + bool hasDivergence() const { return DA.hasDetectedDivergence(); } + + /// The GPU kernel this analysis result is for + const Function &getFunction() const { return DA.getFunction(); } + + /// Whether \p V is divergent. + bool isDivergent(const Value &V) const; + + /// Whether \p V is uniform/non-divergent + bool isUniform(const Value &V) const { return !isDivergent(V); } + + /// Print all divergent values in the kernel. + void print(raw_ostream &OS, const Module *) const; +}; + +/// \brief Divergence analysis frontend for loops. +class LoopDivergenceAnalysis { +public: + LoopDivergenceAnalysis(const DominatorTree &DT, const LoopInfo &LI, + SyncDependenceAnalysis &SDA, const Loop &loop); + + /// Whether \p V is divergent. + bool isDivergent(const Value &V) const; + + /// Whether \p V is uniform/non-divergent. + bool isUniform(const Value &V) const { return !isDivergent(V); } + + /// Print all divergent values in the loop. + void print(raw_ostream &OS, const Module *) const; + +private: + DivergenceAnalysis DA; +}; + +/// \brief Loop divergence printer pass. +/// This is intended for use in LIT testing. +class LoopDivergencePrinter : public FunctionPass { +public: + static char ID; + + LoopDivergencePrinter() : FunctionPass(ID) { + initializeLoopDivergencePrinterPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override; + + /// Analyze all loop-divergence of all loops in @F and print the results. + bool runOnFunction(Function &F) override; + + /// Print all divergent values in the function. + void print(raw_ostream &OS, const Module *) const override; + +private: + std::unique_ptr SDA; + SmallVector, 6> LoopDivInfo; +}; + } // namespace llvm #endif // LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H Index: include/llvm/Analysis/LegacyDivergenceAnalysis.h =================================================================== --- include/llvm/Analysis/LegacyDivergenceAnalysis.h +++ include/llvm/Analysis/LegacyDivergenceAnalysis.h @@ -19,9 +19,11 @@ #include "llvm/ADT/DenseSet.h" #include "llvm/IR/Function.h" #include "llvm/Pass.h" +#include "llvm/Analysis/DivergenceAnalysis.h" namespace llvm { class Value; +class GPUDivergenceAnalysis; class LegacyDivergenceAnalysis : public FunctionPass { public: static char ID; @@ -41,7 +43,7 @@ // // Even if this function returns false, V may still be divergent when used // in a different basic block. - bool isDivergent(const Value *V) const { return DivergentValues.count(V); } + bool isDivergent(const Value *V) const; // Returns true if V is uniform/non-divergent. // @@ -53,6 +55,12 @@ void removeValue(const Value *V) { DivergentValues.erase(V); } private: + // Whether analysis should be performed by GPUDivergenceAnalysis. + bool shouldUseGPUDivergenceAnalysis(const Function &F) const; + + // (optional) handle to new DivergenceAnalysis + std::unique_ptr gpuDA; + // Stores all divergent values. DenseSet DivergentValues; }; Index: include/llvm/Analysis/Passes.h =================================================================== --- include/llvm/Analysis/Passes.h +++ include/llvm/Analysis/Passes.h @@ -66,6 +66,13 @@ // FunctionPass *createLegacyDivergenceAnalysisPass(); + //===--------------------------------------------------------------------===// + // + // createLoopDivergencePrinterPass - This pass determines which branches and + // instructions in a loop are divergent. + // + FunctionPass *createLoopDivergencePrinterPass(); + //===--------------------------------------------------------------------===// // // Minor pass prototypes, allowing us to expose them through bugpoint and Index: include/llvm/InitializePasses.h =================================================================== --- include/llvm/InitializePasses.h +++ include/llvm/InitializePasses.h @@ -213,6 +213,7 @@ void initializeLoopDataPrefetchLegacyPassPass(PassRegistry&); void initializeLoopDeletionLegacyPassPass(PassRegistry&); void initializeLoopDistributeLegacyPass(PassRegistry&); +void initializeLoopDivergencePrinterPass(PassRegistry&); void initializeLoopExtractorPass(PassRegistry&); void initializeLoopGuardWideningLegacyPassPass(PassRegistry&); void initializeLoopIdiomRecognizeLegacyPassPass(PassRegistry&); Index: include/llvm/LinkAllPasses.h =================================================================== --- include/llvm/LinkAllPasses.h +++ include/llvm/LinkAllPasses.h @@ -123,6 +123,7 @@ (void) llvm::createLCSSAPass(); (void) llvm::createLegacyDivergenceAnalysisPass(); (void) llvm::createLICMPass(); + (void) llvm::createLoopDivergencePrinterPass(); (void) llvm::createLoopSinkPass(); (void) llvm::createLazyValueInfoPass(); (void) llvm::createLoopExtractorPass(); Index: lib/Analysis/Analysis.cpp =================================================================== --- lib/Analysis/Analysis.cpp +++ lib/Analysis/Analysis.cpp @@ -59,6 +59,7 @@ initializeLazyValueInfoPrinterPass(Registry); initializeLegacyDivergenceAnalysisPass(Registry); initializeLintPass(Registry); + initializeLoopDivergencePrinterPass(Registry); initializeLoopInfoWrapperPassPass(Registry); initializeMemDepPrinterPass(Registry); initializeMemDerefPrinterPass(Registry); Index: lib/Analysis/DivergenceAnalysis.cpp =================================================================== --- lib/Analysis/DivergenceAnalysis.cpp +++ lib/Analysis/DivergenceAnalysis.cpp @@ -422,3 +422,112 @@ OS << "DIVERGENT:" << I << '\n'; } } + +// class GPUDivergenceAnalysis +GPUDivergenceAnalysis::GPUDivergenceAnalysis(Function &F, + const DominatorTree &DT, + const PostDominatorTree &PDT, + const LoopInfo &LI, + const TargetTransformInfo &TTI) + : SDA(DT, PDT, LI), DA(F, nullptr, DT, LI, SDA, false) { + for (auto &I : instructions(F)) { + if (TTI.isSourceOfDivergence(&I)) { + DA.markDivergent(I); + } else if (TTI.isAlwaysUniform(&I)) { + DA.addUniformOverride(I); + } + } + for (auto &Arg : F.args()) { + if (TTI.isSourceOfDivergence(&Arg)) { + DA.markDivergent(Arg); + } + } + + DA.compute(); +} + +bool GPUDivergenceAnalysis::isDivergent(const Value &val) const { + return DA.isDivergent(val); +} + +void GPUDivergenceAnalysis::print(raw_ostream &OS, const Module *mod) const { + OS << "Divergence of kernel " << DA.getFunction().getName() << " {\n"; + DA.print(OS, mod); + OS << "}\n"; +} + +// class LoopDivergenceAnalysis +LoopDivergenceAnalysis::LoopDivergenceAnalysis(const DominatorTree &DT, + const LoopInfo &LI, + SyncDependenceAnalysis &SDA, + const Loop &Loop) + : DA(*Loop.getHeader()->getParent(), &Loop, DT, LI, SDA, true) { + for (const auto &Phi : Loop.getHeader()->phis()) { + DA.markDivergent(Phi); + } + + // after the scalar remainder loop is extracted, the loop exit condition will + // be uniform + auto LoopExitingInst = Loop.getExitingBlock()->getTerminator(); + auto LoopExitCond = cast(LoopExitingInst)->getCondition(); + DA.addUniformOverride(*LoopExitCond); + + DA.compute(); +} + +bool LoopDivergenceAnalysis::isDivergent(const Value &V) const { + return DA.isDivergent(V); +} + +void LoopDivergenceAnalysis::print(raw_ostream &OS, const Module *Mod) const { + OS << "Divergence of loop " << DA.getRegionLoop()->getName() << " {\n"; + DA.print(OS, Mod); + OS << "}\n"; +} + +// class LoopDivergencePrinter +bool LoopDivergencePrinter::runOnFunction(Function &F) { + const PostDominatorTree &PDT = + getAnalysis().getPostDomTree(); + const DominatorTree &DT = + getAnalysis().getDomTree(); + const LoopInfo &LI = getAnalysis().getLoopInfo(); + SDA = make_unique(DT, PDT, LI); + + for (auto &BB : F) { + auto *Loop = LI.getLoopFor(&BB); + if (!Loop || Loop->getHeader() != &BB) + continue; + LoopDivInfo.push_back( + make_unique(DT, LI, *SDA, *Loop)); + } + + return false; +} + +void LoopDivergencePrinter::print(raw_ostream &OS, const Module *Mod) const { + for (auto &DivInfo : LoopDivInfo) { + DivInfo->print(OS, Mod); + } +} + +// Register this pass. +char LoopDivergencePrinter::ID = 0; +INITIALIZE_PASS_BEGIN(LoopDivergencePrinter, "loop-divergence", + "Loop Divergence Printer", false, true) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_END(LoopDivergencePrinter, "loop-divergence", + "Loop Divergence Printer", false, true) + +FunctionPass *llvm::createLoopDivergencePrinterPass() { + return new LoopDivergencePrinter(); +} + +void LoopDivergencePrinter::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.setPreservesAll(); +} Index: lib/Analysis/LegacyDivergenceAnalysis.cpp =================================================================== --- lib/Analysis/LegacyDivergenceAnalysis.cpp +++ lib/Analysis/LegacyDivergenceAnalysis.cpp @@ -1,4 +1,5 @@ -//===- LegacyDivergenceAnalysis.cpp --------- Legacy Divergence Analysis Implementation -==// +//===- LegacyDivergenceAnalysis.cpp --------- Legacy Divergence Analysis +//Implementation -==// // // The LLVM Compiler Infrastructure // @@ -64,6 +65,9 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/DivergenceAnalysis.h" #include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/Analysis/Passes.h" #include "llvm/Analysis/PostDominators.h" @@ -79,6 +83,12 @@ #define DEBUG_TYPE "divergence" +// transparently use the GPUDivergenceAnalysis +static cl::opt UseGPUDA("use-gpu-divergence-analysis", cl::init(false), + cl::Hidden, + cl::desc("turn the LegacyDivergenceAnalysis into " + "a wrapper for GPUDivergenceAnalysis")); + namespace { class DivergencePropagator { @@ -262,16 +272,17 @@ } } -} /// end namespace anonymous +} // namespace // Register this pass. char LegacyDivergenceAnalysis::ID = 0; -INITIALIZE_PASS_BEGIN(LegacyDivergenceAnalysis, "divergence", "Legacy Divergence Analysis", - false, true) +INITIALIZE_PASS_BEGIN(LegacyDivergenceAnalysis, "divergence", + "Legacy Divergence Analysis", false, true) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) -INITIALIZE_PASS_END(LegacyDivergenceAnalysis, "divergence", "Legacy Divergence Analysis", - false, true) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_END(LegacyDivergenceAnalysis, "divergence", + "Legacy Divergence Analysis", false, true) FunctionPass *llvm::createLegacyDivergenceAnalysisPass() { return new LegacyDivergenceAnalysis(); @@ -280,9 +291,24 @@ void LegacyDivergenceAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); AU.addRequired(); + if (UseGPUDA) + AU.addRequired(); AU.setPreservesAll(); } +bool LegacyDivergenceAnalysis::shouldUseGPUDivergenceAnalysis( + const Function &F) const { + if (!UseGPUDA) + return false; + + // GPUDivergenceAnalysis requires a reducible CFG. + auto &LI = getAnalysis().getLoopInfo(); + using RPOTraversal = ReversePostOrderTraversal; + RPOTraversal FuncRPOT(&F); + return !containsIrreducibleCFG(FuncRPOT, LI); +} + bool LegacyDivergenceAnalysis::runOnFunction(Function &F) { auto *TTIWP = getAnalysisIfAvailable(); if (TTIWP == nullptr) @@ -295,36 +321,59 @@ return false; DivergentValues.clear(); + gpuDA = nullptr; + + auto &DT = getAnalysis().getDomTree(); auto &PDT = getAnalysis().getPostDomTree(); - DivergencePropagator DP(F, TTI, - getAnalysis().getDomTree(), - PDT, DivergentValues); - DP.populateWithSourcesOfDivergence(); - DP.propagate(); - LLVM_DEBUG( - dbgs() << "\nAfter divergence analysis on " << F.getName() << ":\n"; - print(dbgs(), F.getParent()) - ); + + if (shouldUseGPUDivergenceAnalysis(F)) { + // run the new GPU divergence analysis + auto &LI = getAnalysis().getLoopInfo(); + gpuDA = llvm::make_unique(F, DT, PDT, LI, TTI); + + } else { + // run LLVM's existing DivergenceAnalysis + DivergencePropagator DP(F, TTI, DT, PDT, DivergentValues); + DP.populateWithSourcesOfDivergence(); + DP.propagate(); + } + + LLVM_DEBUG(dbgs() << "\nAfter divergence analysis on " << F.getName() + << ":\n"; + print(dbgs(), F.getParent())); + return false; } +bool LegacyDivergenceAnalysis::isDivergent(const Value *V) const { + if (gpuDA) { + return gpuDA->isDivergent(*V); + } + return DivergentValues.count(V); +} + void LegacyDivergenceAnalysis::print(raw_ostream &OS, const Module *) const { - if (DivergentValues.empty()) + if ((!gpuDA || !gpuDA->hasDivergence()) && DivergentValues.empty()) return; - const Value *FirstDivergentValue = *DivergentValues.begin(); + const Function *F; - if (const Argument *Arg = dyn_cast(FirstDivergentValue)) { - F = Arg->getParent(); - } else if (const Instruction *I = - dyn_cast(FirstDivergentValue)) { - F = I->getParent()->getParent(); - } else { - llvm_unreachable("Only arguments and instructions can be divergent"); + if (!DivergentValues.empty()) { + const Value *FirstDivergentValue = *DivergentValues.begin(); + if (const Argument *Arg = dyn_cast(FirstDivergentValue)) { + F = Arg->getParent(); + } else if (const Instruction *I = + dyn_cast(FirstDivergentValue)) { + F = I->getParent()->getParent(); + } else { + llvm_unreachable("Only arguments and instructions can be divergent"); + } + } else if (gpuDA) { + F = &gpuDA->getFunction(); } // Dumps all divergent values in F, arguments and then instructions. for (auto &Arg : F->args()) { - OS << (DivergentValues.count(&Arg) ? "DIVERGENT: " : " "); + OS << (isDivergent(&Arg) ? "DIVERGENT: " : " "); OS << Arg << "\n"; } // Iterate instructions using instructions() to ensure a deterministic order. @@ -332,7 +381,7 @@ auto &BB = *BI; OS << "\n " << BB.getName() << ":\n"; for (auto &I : BB.instructionsWithoutDebug()) { - OS << (DivergentValues.count(&I) ? "DIVERGENT: " : " "); + OS << (isDivergent(&I) ? "DIVERGENT: " : " "); OS << I << "\n"; } } Index: test/Analysis/DivergenceAnalysis/AMDGPU/always_uniform.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/always_uniform.ll @@ -0,0 +1,14 @@ +; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +define amdgpu_kernel void @workitem_id_x() #1 { + %id.x = call i32 @llvm.amdgcn.workitem.id.x() +; CHECK: DIVERGENT: %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %first.lane = call i32 @llvm.amdgcn.readfirstlane(i32 %id.x) +; CHECK-NOT: DIVERGENT: %first.lane = call i32 @llvm.amdgcn.readfirstlane(i32 %id.x) + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 +declare i32 @llvm.amdgcn.readfirstlane(i32) #0 + +attributes #0 = { nounwind readnone } Index: test/Analysis/DivergenceAnalysis/AMDGPU/atomics.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/atomics.ll @@ -0,0 +1,45 @@ +; RUN: opt -mtriple=amdgcn-- -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +; CHECK: DIVERGENT: %orig = atomicrmw xchg i32* %ptr, i32 %val seq_cst +define i32 @test1(i32* %ptr, i32 %val) #0 { + %orig = atomicrmw xchg i32* %ptr, i32 %val seq_cst + ret i32 %orig +} + +; CHECK: DIVERGENT: %orig = cmpxchg i32* %ptr, i32 %cmp, i32 %new seq_cst seq_cst +define {i32, i1} @test2(i32* %ptr, i32 %cmp, i32 %new) { + %orig = cmpxchg i32* %ptr, i32 %cmp, i32 %new seq_cst seq_cst + ret {i32, i1} %orig +} + +; CHECK: DIVERGENT: %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val, i32 0, i32 0, i1 false) +define i32 @test_atomic_inc_i32(i32 addrspace(1)* %ptr, i32 %val) #0 { + %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val, i32 0, i32 0, i1 false) + ret i32 %ret +} + +; CHECK: DIVERGENT: %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val, i32 0, i32 0, i1 false) +define i64 @test_atomic_inc_i64(i64 addrspace(1)* %ptr, i64 %val) #0 { + %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val, i32 0, i32 0, i1 false) + ret i64 %ret +} + +; CHECK: DIVERGENT: %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val, i32 0, i32 0, i1 false) +define i32 @test_atomic_dec_i32(i32 addrspace(1)* %ptr, i32 %val) #0 { + %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val, i32 0, i32 0, i1 false) + ret i32 %ret +} + +; CHECK: DIVERGENT: %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val, i32 0, i32 0, i1 false) +define i64 @test_atomic_dec_i64(i64 addrspace(1)* %ptr, i64 %val) #0 { + %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val, i32 0, i32 0, i1 false) + ret i64 %ret +} + +declare i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* nocapture, i32, i32, i32, i1) #1 +declare i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* nocapture, i64, i32, i32, i1) #1 +declare i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* nocapture, i32, i32, i32, i1) #1 +declare i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* nocapture, i64, i32, i32, i1) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind argmemonly } Index: test/Analysis/DivergenceAnalysis/AMDGPU/hidden_diverge.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/hidden_diverge.ll @@ -0,0 +1,26 @@ +; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +define amdgpu_kernel void @hidden_diverge(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'hidden_diverge' +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %cond.var = icmp slt i32 %tid, 0 + br i1 %cond.var, label %B, label %C ; divergent +; CHECK: DIVERGENT: br i1 %cond.var, +B: + %cond.uni = icmp slt i32 %n, 0 + br i1 %cond.uni, label %C, label %merge ; uniform +; CHECK-NOT: DIVERGENT: br i1 %cond.uni, +C: + %phi.var.hidden = phi i32 [ 1, %entry ], [ 2, %B ] +; CHECK: DIVERGENT: %phi.var.hidden = phi i32 + br label %merge +merge: + %phi.ipd = phi i32 [ %a, %B ], [ %b, %C ] +; CHECK: DIVERGENT: %phi.ipd = phi i32 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 + +attributes #0 = { nounwind readnone } Index: test/Analysis/DivergenceAnalysis/AMDGPU/hidden_loopdiverge.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/hidden_loopdiverge.ll @@ -0,0 +1,223 @@ +; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +; divergent loop (H
, B) +; the divergent join point in %exit is obscured by uniform control joining in %X +define amdgpu_kernel void @hidden_loop_diverge(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'hidden_loop_diverge': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + br i1 %uni.cond, label %X, label %H ; uniform + +H: + %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %B ] + %div.exitx = icmp slt i32 %tid, 0 + br i1 %div.exitx, label %X, label %B ; divergent branch +; CHECK: DIVERGENT: %div.exitx = +; CHECK: DIVERGENT: br i1 %div.exitx, + +B: + %uni.inc = add i32 %uni.merge.h, 1 + %div.exity = icmp sgt i32 %tid, 0 + br i1 %div.exity, label %Y, label %H ; divergent branch +; CHECK: DIVERGENT: %div.exity = +; CHECK: DIVERGENT: br i1 %div.exity, + +X: + %div.merge.x = phi i32 [ %a, %entry ], [ %uni.merge.h, %H ] ; temporal divergent phi + br i1 %uni.cond, label %Y, label %exit +; CHECK: DIVERGENT: %div.merge.x = + +Y: + %div.merge.y = phi i32 [ 42, %X ], [ %b, %B ] + br label %exit +; CHECK: DIVERGENT: %div.merge.y = + +exit: + %div.merge.exit = phi i32 [ %a, %X ], [ %b, %Y ] + ret void +; CHECK: DIVERGENT: %div.merge.exit = +} + +; divergent loop (H
, B) +; the phi nodes in X and Y don't actually receive divergent values +define amdgpu_kernel void @unobserved_loop_diverge(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'unobserved_loop_diverge': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + br i1 %uni.cond, label %X, label %H ; uniform + +H: + %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %B ] + %div.exitx = icmp slt i32 %tid, 0 + br i1 %div.exitx, label %X, label %B ; divergent branch +; CHECK: DIVERGENT: %div.exitx = +; CHECK: DIVERGENT: br i1 %div.exitx, + +B: + %uni.inc = add i32 %uni.merge.h, 1 + %div.exity = icmp sgt i32 %tid, 0 + br i1 %div.exity, label %Y, label %H ; divergent branch +; CHECK: DIVERGENT: %div.exity = +; CHECK: DIVERGENT: br i1 %div.exity, + +X: + %uni.merge.x = phi i32 [ %a, %entry ], [ %b, %H ] + br label %exit + +Y: + %uni.merge.y = phi i32 [ %b, %B ] + br label %exit + +exit: + %div.merge.exit = phi i32 [ %a, %X ], [ %b, %Y ] + ret void +; CHECK: DIVERGENT: %div.merge.exit = +} + +; divergent loop (G
, L) inside divergent loop (H
, B, C, D, G, L) +; the inner loop has no exit to top level. +; the outer loop becomes divergent as its exiting branch in C is control-dependent on the inner loop's divergent loop exit in D. +define amdgpu_kernel void @hidden_nestedloop_diverge(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'hidden_nestedloop_diverge': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + %div.exitx = icmp slt i32 %tid, 0 + br i1 %uni.cond, label %X, label %H + +H: + %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %D ] + br i1 %uni.cond, label %G, label %B +; CHECK: DIVERGENT: %div.exitx = + +B: + br i1 %uni.cond, label %X, label %C + +C: + br i1 %uni.cond, label %Y, label %D + +D: + %uni.inc = add i32 %uni.merge.h, 1 + br label %H + +G: + br i1 %div.exitx, label %C, label %L +; CHECK: DIVERGENT: br i1 %div.exitx, + +L: + br i1 %uni.cond, label %D, label %G + +X: + %div.merge.x = phi i32 [ %a, %entry ], [ %uni.merge.h, %B ] ; temporal divergent phi + br i1 %uni.cond, label %Y, label %exit +; CHECK: DIVERGENT: %div.merge.x = + +Y: + %div.merge.y = phi i32 [ 42, %X ], [ %b, %C ] + br label %exit +; CHECK: DIVERGENT: %div.merge.y = + +exit: + %div.merge.exit = phi i32 [ %a, %X ], [ %b, %Y ] + ret void +; CHECK: DIVERGENT: %div.merge.exit = +} + +; divergent loop (G
, L) in divergent loop (H
, B, C, G, L) +; the outer loop has no immediately divergent exiting edge. +; the inner exiting edge is exiting to top-level through the outer loop causing both to become divergent. +define amdgpu_kernel void @hidden_doublebreak_diverge(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'hidden_doublebreak_diverge': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + %div.exitx = icmp slt i32 %tid, 0 + br i1 %uni.cond, label %X, label %H + +H: + %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %C ] + br i1 %uni.cond, label %G, label %B +; CHECK: DIVERGENT: %div.exitx = + +B: + br i1 %uni.cond, label %Y, label %C + +C: + %uni.inc = add i32 %uni.merge.h, 1 + br label %H + +G: + br i1 %div.exitx, label %X, label %L ; two-level break +; CHECK: DIVERGENT: br i1 %div.exitx, + +L: + br i1 %uni.cond, label %C, label %G + +X: + %div.merge.x = phi i32 [ %a, %entry ], [ %uni.merge.h, %G ] ; temporal divergence + br label %Y +; CHECK: DIVERGENT: %div.merge.x = + +Y: + %div.merge.y = phi i32 [ 42, %X ], [ %b, %B ] + ret void +; CHECK: DIVERGENT: %div.merge.y = +} + +; divergent loop (G
, L) contained inside a uniform loop (H
, B, G, L , D) +define amdgpu_kernel void @hidden_containedloop_diverge(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'hidden_containedloop_diverge': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + %div.exitx = icmp slt i32 %tid, 0 + br i1 %uni.cond, label %X, label %H + +H: + %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc.d, %D ] + br i1 %uni.cond, label %G, label %B +; CHECK: DIVERGENT: %div.exitx = + +B: + %div.merge.b = phi i32 [ 42, %H ], [ %uni.merge.g, %G ] + br label %D +; CHECK: DIVERGENT: %div.merge.b = + +G: + %uni.merge.g = phi i32 [ 123, %H ], [ %uni.inc.l, %L ] + br i1 %div.exitx, label %B, label %L +; CHECK: DIVERGENT: br i1 %div.exitx, + +L: + %uni.inc.l = add i32 %uni.merge.g, 1 + br i1 %uni.cond, label %G, label %D + +D: + %uni.inc.d = add i32 %uni.merge.h, 1 + br i1 %uni.cond, label %X, label %H + +X: + %uni.merge.x = phi i32 [ %a, %entry ], [ %uni.inc.d, %D ] + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 + +attributes #0 = { nounwind readnone } Index: test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll @@ -0,0 +1,13 @@ +; RUN: opt -mtriple=amdgcn-- -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +; CHECK: DIVERGENT: %swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0 +define amdgpu_kernel void @ds_swizzle(i32 addrspace(1)* %out, i32 %src) #0 { + %swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0 + store i32 %swizzle, i32 addrspace(1)* %out, align 4 + ret void +} + +declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1 + +attributes #0 = { nounwind convergent } +attributes #1 = { nounwind readnone convergent } Index: test/Analysis/DivergenceAnalysis/AMDGPU/irreducible.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/irreducible.ll @@ -0,0 +1,48 @@ +; RUN: opt -mtriple=amdgcn-- -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +; This test contains an unstructured loop. +; +-------------- entry ----------------+ +; | | +; V V +; i1 = phi(0, i3) i2 = phi(0, i3) +; j1 = i1 + 1 ---> i3 = phi(j1, j2) <--- j2 = i2 + 2 +; ^ | ^ +; | V | +; +-------- switch (tid / i3) ----------+ +; | +; V +; if (i3 == 5) // divergent +; because sync dependent on (tid / i3). +define i32 @unstructured_loop(i1 %entry_cond) { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'unstructured_loop' +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + br i1 %entry_cond, label %loop_entry_1, label %loop_entry_2 +loop_entry_1: + %i1 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ] + %j1 = add i32 %i1, 1 + br label %loop_body +loop_entry_2: + %i2 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ] + %j2 = add i32 %i2, 2 + br label %loop_body +loop_body: + %i3 = phi i32 [ %j1, %loop_entry_1 ], [ %j2, %loop_entry_2 ] + br label %loop_latch +loop_latch: + %div = sdiv i32 %tid, %i3 + switch i32 %div, label %branch [ i32 1, label %loop_entry_1 + i32 2, label %loop_entry_2 ] +branch: + %cmp = icmp eq i32 %i3, 5 + br i1 %cmp, label %then, label %else +; CHECK: DIVERGENT: br i1 %cmp, +then: + ret i32 0 +else: + ret i32 1 +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 + +attributes #0 = { nounwind readnone } Index: test/Analysis/DivergenceAnalysis/AMDGPU/kernel-args.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/kernel-args.ll @@ -0,0 +1,41 @@ +; RUN: opt %s -mtriple amdgcn-- -analyze -divergence -use-gpu-divergence-analysis | FileCheck %s + +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'test_amdgpu_ps': +; CHECK: DIVERGENT: +; CHECK-NOT: %arg0 +; CHECK-NOT: %arg1 +; CHECK-NOT: %arg2 +; CHECK: <2 x i32> %arg3 +; CHECK: DIVERGENT: <3 x i32> %arg4 +; CHECK: DIVERGENT: float %arg5 +; CHECK: DIVERGENT: i32 %arg6 + +define amdgpu_ps void @test_amdgpu_ps([4 x <16 x i8>] addrspace(2)* byval %arg0, float inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <3 x i32> %arg4, float %arg5, i32 %arg6) #0 { + ret void +} + +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'test_amdgpu_kernel': +; CHECK-NOT: %arg0 +; CHECK-NOT: %arg1 +; CHECK-NOT: %arg2 +; CHECK-NOT: %arg3 +; CHECK-NOT: %arg4 +; CHECK-NOT: %arg5 +; CHECK-NOT: %arg6 +define amdgpu_kernel void @test_amdgpu_kernel([4 x <16 x i8>] addrspace(2)* byval %arg0, float inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <3 x i32> %arg4, float %arg5, i32 %arg6) #0 { + ret void +} + +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'test_c': +; CHECK: DIVERGENT: +; CHECK: DIVERGENT: +; CHECK: DIVERGENT: +; CHECK: DIVERGENT: +; CHECK: DIVERGENT: +; CHECK: DIVERGENT: +; CHECK: DIVERGENT: +define void @test_c([4 x <16 x i8>] addrspace(2)* byval %arg0, float inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <3 x i32> %arg4, float %arg5, i32 %arg6) #0 { + ret void +} + +attributes #0 = { nounwind } Index: test/Analysis/DivergenceAnalysis/AMDGPU/lit.local.cfg =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'AMDGPU' in config.root.targets: + config.unsupported = True Index: test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll @@ -0,0 +1,103 @@ +;RUN: opt -mtriple=amdgcn-mesa-mesa3d -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.swap( +define float @buffer_atomic_swap(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.swap(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.add( +define float @buffer_atomic_add(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.add(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.sub( +define float @buffer_atomic_sub(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.sub(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.smin( +define float @buffer_atomic_smin(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.smin(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.umin( +define float @buffer_atomic_umin(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.umin(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.smax( +define float @buffer_atomic_smax(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.smax(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.umax( +define float @buffer_atomic_umax(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.umax(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.and( +define float @buffer_atomic_and(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.and(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.or( +define float @buffer_atomic_or(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.or(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.xor( +define float @buffer_atomic_xor(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.xor(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.cmpswap( +define float @buffer_atomic_cmpswap(<4 x i32> inreg %rsrc, i32 inreg %data, i32 inreg %cmp) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %data, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +declare i32 @llvm.amdgcn.buffer.atomic.swap(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.add(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.smin(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.umin(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.smax(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.umax(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.and(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.or(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.xor(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32, i32, <4 x i32>, i32, i32, i1) #0 + +attributes #0 = { nounwind } Index: test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.image.atomic.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.image.atomic.ll @@ -0,0 +1,131 @@ +;RUN: opt -mtriple=amdgcn-mesa-mesa3d -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32( +define float @image_atomic_swap(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32( +define float @image_atomic_add(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i32( +define float @image_atomic_sub(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i32( +define float @image_atomic_smin(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i32( +define float @image_atomic_umin(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i32( +define float @image_atomic_smax(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i32( +define float @image_atomic_umax(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.and.1d.i32.i32( +define float @image_atomic_and(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.and.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.or.1d.i32.i32( +define float @image_atomic_or(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.or.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i32( +define float @image_atomic_xor(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i32( +define float @image_atomic_inc(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i32( +define float @image_atomic_dec(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32( +define float @image_atomic_cmpswap(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data, i32 inreg %cmp) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32 %data, i32 %cmp, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32( +define float @image_atomic_add_2d(<8 x i32> inreg %rsrc, i32 inreg %s, i32 inreg %t, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32 %data, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +declare i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.and.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.or.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #0 + +declare i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #0 + +attributes #0 = { nounwind } Index: test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll @@ -0,0 +1,30 @@ +; RUN: opt %s -mtriple amdgcn-- -analyze -divergence -use-gpu-divergence-analysis | FileCheck %s + +; CHECK: DIVERGENT: %tmp5 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %tmp2 +; CHECK: DIVERGENT: %tmp10 = load volatile float, float addrspace(1)* %tmp5, align 4 +; CHECK: DIVERGENT: %tmp11 = load volatile float, float addrspace(1)* %tmp5, align 4 + +; The post dominator tree does not have a root node in this case +define amdgpu_kernel void @no_return_blocks(float addrspace(1)* noalias nocapture readonly %arg, float addrspace(1)* noalias nocapture readonly %arg1) #0 { +bb0: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0 + %tmp2 = sext i32 %tmp to i64 + %tmp5 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %tmp2 + %tmp6 = load volatile float, float addrspace(1)* %tmp5, align 4 + %tmp8 = fcmp olt float %tmp6, 0.000000e+00 + br i1 %tmp8, label %bb1, label %bb2 + +bb1: + %tmp10 = load volatile float, float addrspace(1)* %tmp5, align 4 + br label %bb2 + +bb2: + %tmp11 = load volatile float, float addrspace(1)* %tmp5, align 4 + br label %bb1 +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } Index: test/Analysis/DivergenceAnalysis/AMDGPU/phi-undef.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/phi-undef.ll @@ -0,0 +1,31 @@ +; RUN: opt -mtriple=amdgcn-- -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +; CHECK-LABEL: 'test1': +; CHECK-NEXT: DIVERGENT: i32 %bound +; CHECK: {{^ *}}%counter = +; CHECK-NEXT: DIVERGENT: %break = icmp sge i32 %counter, %bound +; CHECK-NEXT: DIVERGENT: br i1 %break, label %footer, label %body +; CHECK: {{^ *}}%counter.next = +; CHECK: {{^ *}}%counter.footer = +; CHECK: DIVERGENT: br i1 %break, label %end, label %header +; Note: %counter is not divergent! +define amdgpu_ps void @test1(i32 %bound) { +entry: + br label %header + +header: + %counter = phi i32 [ 0, %entry ], [ %counter.footer, %footer ] + %break = icmp sge i32 %counter, %bound + br i1 %break, label %footer, label %body + +body: + %counter.next = add i32 %counter, 1 + br label %footer + +footer: + %counter.footer = phi i32 [ %counter.next, %body ], [ undef, %header ] + br i1 %break, label %end, label %header + +end: + ret void +} Index: test/Analysis/DivergenceAnalysis/AMDGPU/temporal_diverge.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/temporal_diverge.ll @@ -0,0 +1,154 @@ +; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +; temporal-divergent use of value carried by divergent loop +define amdgpu_kernel void @temporal_diverge(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'temporal_diverge': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + br label %H + +H: + %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %H ] + %uni.inc = add i32 %uni.merge.h, 1 + %div.exitx = icmp slt i32 %tid, 0 + br i1 %div.exitx, label %X, label %H ; divergent branch +; CHECK: DIVERGENT: %div.exitx = +; CHECK: DIVERGENT: br i1 %div.exitx, + +X: + %div.user = add i32 %uni.inc, 5 + ret void +} + +; temporal-divergent use of value carried by divergent loop inside a top-level loop +define amdgpu_kernel void @temporal_diverge_inloop(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'temporal_diverge_inloop': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + br label %G + +G: + br label %H + +H: + %uni.merge.h = phi i32 [ 0, %G ], [ %uni.inc, %H ] + %uni.inc = add i32 %uni.merge.h, 1 + %div.exitx = icmp slt i32 %tid, 0 + br i1 %div.exitx, label %X, label %H ; divergent branch +; CHECK: DIVERGENT: %div.exitx = +; CHECK: DIVERGENT: br i1 %div.exitx, + +X: + %div.user = add i32 %uni.inc, 5 + br i1 %uni.cond, label %G, label %Y + +Y: + %div.alsouser = add i32 %uni.inc, 5 + ret void +} + + +; temporal-uniform use of a valud, definition and users are carried by a surrounding divergent loop +define amdgpu_kernel void @temporal_uniform_indivloop(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'temporal_uniform_indivloop': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + br label %G + +G: + br label %H + +H: + %uni.merge.h = phi i32 [ 0, %G ], [ %uni.inc, %H ] + %uni.inc = add i32 %uni.merge.h, 1 + br i1 %uni.cond, label %X, label %H ; divergent branch + +X: + %uni.user = add i32 %uni.inc, 5 + %div.exity = icmp slt i32 %tid, 0 +; CHECK: DIVERGENT: %div.exity = + br i1 %div.exity, label %G, label %Y +; CHECK: DIVERGENT: br i1 %div.exity, + +Y: + %div.alsouser = add i32 %uni.inc, 5 + ret void +} + + +; temporal-divergent use of value carried by divergent loop, user is inside sibling loop +define amdgpu_kernel void @temporal_diverge_loopuser(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'temporal_diverge_loopuser': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + br label %H + +H: + %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %H ] + %uni.inc = add i32 %uni.merge.h, 1 + %div.exitx = icmp slt i32 %tid, 0 + br i1 %div.exitx, label %X, label %H ; divergent branch +; CHECK: DIVERGENT: %div.exitx = +; CHECK: DIVERGENT: br i1 %div.exitx, + +X: + br label %G + +G: + %div.user = add i32 %uni.inc, 5 + br i1 %uni.cond, label %G, label %Y + +Y: + ret void +} + +; temporal-divergent use of value carried by divergent loop, user is inside sibling loop, defs and use are carried by a uniform loop +define amdgpu_kernel void @temporal_diverge_loopuser_nested(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'temporal_diverge_loopuser_nested': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + br label %H + +H: + %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %H ] + %uni.inc = add i32 %uni.merge.h, 1 + %div.exitx = icmp slt i32 %tid, 0 + br i1 %div.exitx, label %X, label %H ; divergent branch +; CHECK: DIVERGENT: %div.exitx = +; CHECK: DIVERGENT: br i1 %div.exitx, + +X: + br label %G + +G: + %div.user = add i32 %uni.inc, 5 + br i1 %uni.cond, label %G, label %Y + +Y: + ret void +} + + +declare i32 @llvm.amdgcn.workitem.id.x() #0 + +attributes #0 = { nounwind readnone } Index: test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll @@ -0,0 +1,45 @@ +; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +declare i32 @llvm.amdgcn.workitem.id.x() #0 +declare i32 @llvm.amdgcn.workitem.id.y() #0 +declare i32 @llvm.amdgcn.workitem.id.z() #0 +declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0 +declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0 + +; CHECK: DIVERGENT: %id.x = call i32 @llvm.amdgcn.workitem.id.x() +define amdgpu_kernel void @workitem_id_x() #1 { + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + store volatile i32 %id.x, i32 addrspace(1)* undef + ret void +} + +; CHECK: DIVERGENT: %id.y = call i32 @llvm.amdgcn.workitem.id.y() +define amdgpu_kernel void @workitem_id_y() #1 { + %id.y = call i32 @llvm.amdgcn.workitem.id.y() + store volatile i32 %id.y, i32 addrspace(1)* undef + ret void +} + +; CHECK: DIVERGENT: %id.z = call i32 @llvm.amdgcn.workitem.id.z() +define amdgpu_kernel void @workitem_id_z() #1 { + %id.z = call i32 @llvm.amdgcn.workitem.id.z() + store volatile i32 %id.z, i32 addrspace(1)* undef + ret void +} + +; CHECK: DIVERGENT: %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 0, i32 0) +define amdgpu_kernel void @mbcnt_lo() #1 { + %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 0, i32 0) + store volatile i32 %mbcnt.lo, i32 addrspace(1)* undef + ret void +} + +; CHECK: DIVERGENT: %mbcnt.hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 0, i32 0) +define amdgpu_kernel void @mbcnt_hi() #1 { + %mbcnt.hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 0, i32 0) + store volatile i32 %mbcnt.hi, i32 addrspace(1)* undef + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } Index: test/Analysis/DivergenceAnalysis/Loops/IndirectUniAccess.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/Loops/IndirectUniAccess.ll @@ -0,0 +1,74 @@ +; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s + +; CHECK: Divergence of loop for.body { +; CHECK-NEXT: DIVERGENT: %indvars.iv29 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next30, %for.cond.cleanup3 ] +; CHECK-NEXT: DIVERGENT: %x.0.lcssa = phi double [ 0.000000e+00, %for.body ], [ %add, %for.body4 ] +; CHECK-NEXT: DIVERGENT: %arrayidx10 = getelementptr inbounds double, double* %C, i64 %indvars.iv29 +; CHECK-NEXT: DIVERGENT: store double %x.0.lcssa, double* %arrayidx10, align 8 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next30 = add nuw nsw i64 %indvars.iv29, 1 +; CHECK-NEXT: DIVERGENT: %x.025 = phi double [ %add, %for.body4 ], [ 0.000000e+00, %for.body4.preheader ] +; CHECK-NEXT: DIVERGENT: %arrayidx8 = getelementptr inbounds double, double* %1, i64 %indvars.iv29 +; CHECK-NEXT: DIVERGENT: %2 = load double, double* %arrayidx8, align 8 +; CHECK-NEXT: DIVERGENT: %add = fadd double %x.025, %2 +; CHECK-NEXT: } +; CHECK: Divergence of loop for.body4 { +; CHECK-NEXT: DIVERGENT: %indvars.iv = phi i64 [ %indvars.iv.next, %for.body4 ], [ 0, %for.body4.preheader ] +; CHECK-NEXT: DIVERGENT: %x.025 = phi double [ %add, %for.body4 ], [ 0.000000e+00, %for.body4.preheader ] +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds i32, i32* %Index, i64 %indvars.iv +; CHECK-NEXT: DIVERGENT: %0 = load i32, i32* %arrayidx, align 4 +; CHECK-NEXT: DIVERGENT: %idxprom5 = sext i32 %0 to i64 +; CHECK-NEXT: DIVERGENT: %arrayidx6 = getelementptr inbounds double*, double** %A, i64 %idxprom5 +; CHECK-NEXT: DIVERGENT: %1 = load double*, double** %arrayidx6, align 8 +; CHECK-NEXT: DIVERGENT: %arrayidx8 = getelementptr inbounds double, double* %1, i64 %indvars.iv29 +; CHECK-NEXT: DIVERGENT: %2 = load double, double* %arrayidx8, align 8 +; CHECK-NEXT: DIVERGENT: %add = fadd double %x.025, %2 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK-NEXT: } + +; Function Attrs: norecurse nounwind uwtable +define void @test(i32* nocapture readonly %Index, double** nocapture readonly %A, double* nocapture %C, i32 %m, i32 %n) #0 { +entry: + %cmp27 = icmp sgt i32 %n, 0 + br i1 %cmp27, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %cmp224 = icmp sgt i32 %m, 0 + %wide.trip.count = zext i32 %m to i64 + %wide.trip.count31 = zext i32 %n to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.cond.cleanup3, %entry + ret void + +for.body: ; preds = %for.cond.cleanup3, %for.body.lr.ph + %indvars.iv29 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next30, %for.cond.cleanup3 ] + br i1 %cmp224, label %for.body4.preheader, label %for.cond.cleanup3 + +for.body4.preheader: ; preds = %for.body + br label %for.body4 + +for.cond.cleanup3: ; preds = %for.body4, %for.body + %x.0.lcssa = phi double [ 0.000000e+00, %for.body ], [ %add, %for.body4 ] + %arrayidx10 = getelementptr inbounds double, double* %C, i64 %indvars.iv29 + store double %x.0.lcssa, double* %arrayidx10, align 8 + %indvars.iv.next30 = add nuw nsw i64 %indvars.iv29, 1 + %exitcond32 = icmp eq i64 %indvars.iv.next30, %wide.trip.count31 + br i1 %exitcond32, label %for.cond.cleanup, label %for.body + +for.body4: ; preds = %for.body4.preheader, %for.body4 + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body4 ], [ 0, %for.body4.preheader ] + %x.025 = phi double [ %add, %for.body4 ], [ 0.000000e+00, %for.body4.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %Index, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %idxprom5 = sext i32 %0 to i64 + %arrayidx6 = getelementptr inbounds double*, double** %A, i64 %idxprom5 + %1 = load double*, double** %arrayidx6, align 8 + %arrayidx8 = getelementptr inbounds double, double* %1, i64 %indvars.iv29 + %2 = load double, double* %arrayidx8, align 8 + %add = fadd double %x.025, %2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup3, label %for.body4 +} + +attributes #0 = { norecurse nounwind } Index: test/Analysis/DivergenceAnalysis/Loops/LoopWithDivBranch.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/Loops/LoopWithDivBranch.ll @@ -0,0 +1,44 @@ +; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s + +; CHECK: Divergence of loop for.body { +; CHECK-NEXT: DIVERGENT: %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %B ] +; CHECK-NEXT: DIVERGENT: %hfreq = srem i64 %indvars.iv, 2 +; CHECK-NEXT: DIVERGENT: %toggle = trunc i64 %hfreq to i1 +; CHECK-NEXT: DIVERGENT: br i1 %toggle, label %A, label %B +; CHECK-NEXT: DIVERGENT: %divphi = phi float [ %cast, %A ], [ 4.200000e+01, %for.body ] +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds float, float* %ptr, i64 %indvars.iv +; CHECK-NEXT: DIVERGENT: store float %divphi, float* %arrayidx, align 4 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK-NEXT: } +define void @test1(float* nocapture %ptr, i64 %n) #0 { + entry: + %cmp = icmp sgt i64 %n, 0 + br i1 %cmp, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body, %for.body.lr.ph + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %B ] + %hfreq = srem i64 %indvars.iv, 2 + %toggle = trunc i64 %hfreq to i1 + br i1 %toggle, label %A, label %B + +A: + %trunc = trunc i64 %n to i32 + %cast = sitofp i32 %trunc to float + br label %B + +B: + %divphi = phi float [ %cast, %A ], [ 4.200000e+01, %for.body ] + %arrayidx = getelementptr inbounds float, float* %ptr, i64 %indvars.iv + store float %divphi, float* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +attributes #0 = { nounwind } Index: test/Analysis/DivergenceAnalysis/Loops/LoopWithDivLoop.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/Loops/LoopWithDivLoop.ll @@ -0,0 +1,60 @@ +; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s + +; CHECK: Printing analysis 'Loop Divergence Printer' for function 'test1': +; CHECK-NEXT: Divergence of loop for.body { +; CHECK-NEXT: DIVERGENT: %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.latch ] +; CHECK-NEXT: DIVERGENT: %row = mul i64 %n, %indvars.iv +; CHECK-NEXT: DIVERGENT: %idx = add i64 %row, %indvars.iv2 +; CHECK-NEXT: DIVERGENT: %trunc = trunc i64 %idx to i32 +; CHECK-NEXT: DIVERGENT: %val = sitofp i32 %trunc to float +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds float, float* %ptr, i64 %idx +; CHECK-NEXT: DIVERGENT: store float %val, float* %arrayidx, align 4 +; CHECK-NEXT: DIVERGENT: %exitcond2 = icmp sge i64 %indvars.iv.next2, %indvars.iv +; CHECK-NEXT: DIVERGENT: br i1 %exitcond2, label %for.latch, label %for.body2 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK-NEXT: } +; CHECK: Divergence of loop for.body2 { +; CHECK-NEXT: DIVERGENT: %indvars.iv2 = phi i64 [ 0, %for.body ], [ %indvars.iv.next2, %for.body2 ] +; CHECK-NEXT: DIVERGENT: %idx = add i64 %row, %indvars.iv2 +; CHECK-NEXT: DIVERGENT: %trunc = trunc i64 %idx to i32 +; CHECK-NEXT: DIVERGENT: %val = sitofp i32 %trunc to float +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds float, float* %ptr, i64 %idx +; CHECK-NEXT: DIVERGENT: store float %val, float* %arrayidx, align 4 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next2 = add nuw nsw i64 %indvars.iv2, 1 +; CHECK-NEXT: } + +define void @test1(float* nocapture %ptr, i64 %n) #0 { + entry: + %cmp = icmp sgt i64 %n, 0 + br i1 %cmp, label %for.body.lr.ph, label %exit + +for.body.lr.ph: ; preds = %entry + br label %for.body + +exit: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.latch ] + br label %for.body2 + +for.body2: + %indvars.iv2 = phi i64 [ 0, %for.body ], [ %indvars.iv.next2, %for.body2 ] + %row = mul i64 %n, %indvars.iv + %idx = add i64 %row, %indvars.iv2 + %trunc = trunc i64 %idx to i32 + %val = sitofp i32 %trunc to float + %arrayidx = getelementptr inbounds float, float* %ptr, i64 %idx + store float %val, float* %arrayidx, align 4 + %indvars.iv.next2 = add nuw nsw i64 %indvars.iv2, 1 + %exitcond2 = icmp sge i64 %indvars.iv.next2, %indvars.iv + br i1 %exitcond2, label %for.latch, label %for.body2 + +for.latch: + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond, label %exit, label %for.body +} + +attributes #0 = { nounwind } + Index: test/Analysis/DivergenceAnalysis/Loops/LoopWithLI.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/Loops/LoopWithLI.ll @@ -0,0 +1,31 @@ +; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s + +; CHECK: Divergence of loop for.body { +; CHECK-NEXT: DIVERGENT: %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv +; CHECK-NEXT: DIVERGENT: store float %cast, float* %arrayidx, align 4 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK-NEXT: } +define void @test1(float* nocapture %A, i64 %n) #0 { + entry: + %cmp = icmp sgt i64 %n, 0 + br i1 %cmp, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body, %for.body.lr.ph + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv + %trunc = trunc i64 %n to i32 + %cast = sitofp i32 %trunc to float + store float %cast, float* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +attributes #0 = { nounwind } Index: test/Analysis/DivergenceAnalysis/Loops/LoopWithUniBranch.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/Loops/LoopWithUniBranch.ll @@ -0,0 +1,39 @@ +; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s + +; CHECK: Divergence of loop for.body { +; CHECK-NEXT: DIVERGENT: %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %B ] +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds float, float* %ptr, i64 %indvars.iv +; CHECK-NEXT: DIVERGENT: store float %divphi, float* %arrayidx, align 4 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK-NEXT: } +define void @test1(float* nocapture %ptr, i64 %n) #0 { + entry: + %cmp = icmp sgt i64 %n, 0 + br i1 %cmp, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body, %for.body.lr.ph + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %B ] + %invar = trunc i64 %n to i1 + br i1 %invar, label %A, label %B + +A: + %trunc = trunc i64 %n to i32 + %cast = sitofp i32 %trunc to float + br label %B + +B: + %divphi = phi float [ %cast, %A ], [ 4.200000e+01, %for.body ] + %arrayidx = getelementptr inbounds float, float* %ptr, i64 %indvars.iv + store float %divphi, float* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +attributes #0 = { nounwind } Index: test/Analysis/DivergenceAnalysis/Loops/LoopWithUniLoop.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/Loops/LoopWithUniLoop.ll @@ -0,0 +1,55 @@ +; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s + +; CHECK: Divergence of loop for.body { +; CHECK-NEXT: DIVERGENT: %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.latch ] +; CHECK-NEXT: DIVERGENT: %row = mul i64 %n, %indvars.iv +; CHECK-NEXT: DIVERGENT: %idx = add i64 %row, %indvars.iv2 +; CHECK-NEXT: DIVERGENT: %trunc = trunc i64 %idx to i32 +; CHECK-NEXT: DIVERGENT: %val = sitofp i32 %trunc to float +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds float, float* %ptr, i64 %idx +; CHECK-NEXT: DIVERGENT: store float %val, float* %arrayidx, align 4 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK-NEXT: } +; CHECK: Divergence of loop for.body2 { +; CHECK-NEXT: DIVERGENT: %indvars.iv2 = phi i64 [ 0, %for.body ], [ %indvars.iv.next2, %for.body2 ] +; CHECK-NEXT: DIVERGENT: %idx = add i64 %row, %indvars.iv2 +; CHECK-NEXT: DIVERGENT: %trunc = trunc i64 %idx to i32 +; CHECK-NEXT: DIVERGENT: %val = sitofp i32 %trunc to float +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds float, float* %ptr, i64 %idx +; CHECK-NEXT: DIVERGENT: store float %val, float* %arrayidx, align 4 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next2 = add nuw nsw i64 %indvars.iv2, 1 +; CHECK-NEXT: } +define void @test1(float* nocapture %ptr, i64 %n) #0 { + entry: + %cmp = icmp sgt i64 %n, 0 + br i1 %cmp, label %for.body.lr.ph, label %exit + +for.body.lr.ph: ; preds = %entry + br label %for.body + +exit: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.latch ] + br label %for.body2 + +for.body2: + %indvars.iv2 = phi i64 [ 0, %for.body ], [ %indvars.iv.next2, %for.body2 ] + %row = mul i64 %n, %indvars.iv + %idx = add i64 %row, %indvars.iv2 + %trunc = trunc i64 %idx to i32 + %val = sitofp i32 %trunc to float + %arrayidx = getelementptr inbounds float, float* %ptr, i64 %idx + store float %val, float* %arrayidx, align 4 + %indvars.iv.next2 = add nuw nsw i64 %indvars.iv2, 1 + %exitcond2 = icmp eq i64 %indvars.iv.next2, %n + br i1 %exitcond2, label %for.latch, label %for.body2 + +for.latch: + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond, label %exit, label %for.body +} + +attributes #0 = { nounwind } Index: test/Analysis/DivergenceAnalysis/Loops/NonAffineUniLoop.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/Loops/NonAffineUniLoop.ll @@ -0,0 +1,110 @@ +; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s + +; CHECK: Divergence of loop for.body { +; CHECK-NEXT: DIVERGENT: %indvars.iv53 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next54, %for.cond.cleanup3 ] +; CHECK-NEXT: DIVERGENT: %indvars.iv.next54 = add nuw nsw i64 %indvars.iv53, 1 +; CHECK-NEXT: DIVERGENT: %5 = add nsw i64 %4, %indvars.iv53 +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds double, double* %A, i64 %5 +; CHECK-NEXT: DIVERGENT: %6 = load double, double* %arrayidx, align 8 +; CHECK-NEXT: DIVERGENT: %8 = add nsw i64 %7, %indvars.iv53 +; CHECK-NEXT: DIVERGENT: %arrayidx14 = getelementptr inbounds double, double* %A, i64 %8 +; CHECK-NEXT: DIVERGENT: %9 = load double, double* %arrayidx14, align 8 +; CHECK-NEXT: DIVERGENT: %add15 = fadd double %6, %9 +; CHECK-NEXT: DIVERGENT: store double %add15, double* %arrayidx14, align 8 +; CHECK-NEXT: } +; CHECK: Divergence of loop for.body8.lr.ph { +; CHECK-NEXT: DIVERGENT: %mul44 = phi i32 [ %mul, %for.cond.cleanup7 ], [ 2, %for.body8.lr.ph.preheader ] +; CHECK-NEXT: DIVERGENT: %len.043 = phi i32 [ %mul44, %for.cond.cleanup7 ], [ 1, %for.body8.lr.ph.preheader ] +; CHECK-NEXT: DIVERGENT: %1 = sext i32 %mul44 to i64 +; CHECK-NEXT: DIVERGENT: %2 = sext i32 %len.043 to i64 +; CHECK-NEXT: DIVERGENT: %mul = shl nsw i32 %mul44, 1 +; CHECK-NEXT: DIVERGENT: %indvars.iv = phi i64 [ 0, %for.body8.lr.ph ], [ %indvars.iv.next, %for.body8 ] +; CHECK-NEXT: DIVERGENT: %3 = add nsw i64 %indvars.iv, %2 +; CHECK-NEXT: DIVERGENT: %4 = mul nsw i64 %3, %0 +; CHECK-NEXT: DIVERGENT: %5 = add nsw i64 %4, %indvars.iv53 +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds double, double* %A, i64 %5 +; CHECK-NEXT: DIVERGENT: %6 = load double, double* %arrayidx, align 8 +; CHECK-NEXT: DIVERGENT: %7 = mul nsw i64 %indvars.iv, %0 +; CHECK-NEXT: DIVERGENT: %8 = add nsw i64 %7, %indvars.iv53 +; CHECK-NEXT: DIVERGENT: %arrayidx14 = getelementptr inbounds double, double* %A, i64 %8 +; CHECK-NEXT: DIVERGENT: %9 = load double, double* %arrayidx14, align 8 +; CHECK-NEXT: DIVERGENT: %add15 = fadd double %6, %9 +; CHECK-NEXT: DIVERGENT: store double %add15, double* %arrayidx14, align 8 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next = add i64 %indvars.iv, %1 +; CHECK-NEXT: DIVERGENT: %cmp6 = icmp slt i64 %indvars.iv.next, %0 +; CHECK-NEXT: DIVERGENT: br i1 %cmp6, label %for.body8, label %for.cond.cleanup7 +; CHECK-NEXT: } +; CHECK: Divergence of loop for.body8 { +; CHECK-NEXT: DIVERGENT: %indvars.iv = phi i64 [ 0, %for.body8.lr.ph ], [ %indvars.iv.next, %for.body8 ] +; CHECK-NEXT: DIVERGENT: %3 = add nsw i64 %indvars.iv, %2 +; CHECK-NEXT: DIVERGENT: %4 = mul nsw i64 %3, %0 +; CHECK-NEXT: DIVERGENT: %5 = add nsw i64 %4, %indvars.iv53 +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds double, double* %A, i64 %5 +; CHECK-NEXT: DIVERGENT: %6 = load double, double* %arrayidx, align 8 +; CHECK-NEXT: DIVERGENT: %7 = mul nsw i64 %indvars.iv, %0 +; CHECK-NEXT: DIVERGENT: %8 = add nsw i64 %7, %indvars.iv53 +; CHECK-NEXT: DIVERGENT: %arrayidx14 = getelementptr inbounds double, double* %A, i64 %8 +; CHECK-NEXT: DIVERGENT: %9 = load double, double* %arrayidx14, align 8 +; CHECK-NEXT: DIVERGENT: %add15 = fadd double %6, %9 +; CHECK-NEXT: DIVERGENT: store double %add15, double* %arrayidx14, align 8 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next = add i64 %indvars.iv, %1 +; CHECK-NEXT: } + +; Function Attrs: norecurse nounwind uwtable +define void @foo(double* nocapture %A, i32 %n) local_unnamed_addr #0 { +entry: + %cmp45 = icmp sgt i32 %n, 0 + br i1 %cmp45, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %cmp242 = icmp sgt i32 %n, 2 + %0 = sext i32 %n to i64 + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.cond.cleanup3, %entry + ret void + +for.body: ; preds = %for.cond.cleanup3, %for.body.lr.ph + %indvars.iv53 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next54, %for.cond.cleanup3 ] + br i1 %cmp242, label %for.body8.lr.ph.preheader, label %for.cond.cleanup3 + +for.body8.lr.ph.preheader: ; preds = %for.body + br label %for.body8.lr.ph + +for.cond.cleanup3: ; preds = %for.cond.cleanup7, %for.body + %indvars.iv.next54 = add nuw nsw i64 %indvars.iv53, 1 + %exitcond = icmp eq i64 %indvars.iv.next54, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +for.body8.lr.ph: ; preds = %for.body8.lr.ph.preheader, %for.cond.cleanup7 + %mul44 = phi i32 [ %mul, %for.cond.cleanup7 ], [ 2, %for.body8.lr.ph.preheader ] + %len.043 = phi i32 [ %mul44, %for.cond.cleanup7 ], [ 1, %for.body8.lr.ph.preheader ] + %1 = sext i32 %mul44 to i64 + %2 = sext i32 %len.043 to i64 + br label %for.body8 + +for.cond.cleanup7: ; preds = %for.body8 + %mul = shl nsw i32 %mul44, 1 + %cmp2 = icmp slt i32 %mul, %n + br i1 %cmp2, label %for.body8.lr.ph, label %for.cond.cleanup3 + +for.body8: ; preds = %for.body8.lr.ph, %for.body8 + %indvars.iv = phi i64 [ 0, %for.body8.lr.ph ], [ %indvars.iv.next, %for.body8 ] + %3 = add nsw i64 %indvars.iv, %2 + %4 = mul nsw i64 %3, %0 + %5 = add nsw i64 %4, %indvars.iv53 + %arrayidx = getelementptr inbounds double, double* %A, i64 %5 + %6 = load double, double* %arrayidx, align 8 + %7 = mul nsw i64 %indvars.iv, %0 + %8 = add nsw i64 %7, %indvars.iv53 + %arrayidx14 = getelementptr inbounds double, double* %A, i64 %8 + %9 = load double, double* %arrayidx14, align 8 + %add15 = fadd double %6, %9 + store double %add15, double* %arrayidx14, align 8 + %indvars.iv.next = add i64 %indvars.iv, %1 + %cmp6 = icmp slt i64 %indvars.iv.next, %0 + br i1 %cmp6, label %for.body8, label %for.cond.cleanup7 +} + +attributes #0 = { norecurse nounwind uwtable } Index: test/Analysis/DivergenceAnalysis/Loops/SingleBlockLoop.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/Loops/SingleBlockLoop.ll @@ -0,0 +1,29 @@ +; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s + +; CHECK: Divergence of loop for.body { +; CHECK-NEXT: DIVERGENT: %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv +; CHECK-NEXT: DIVERGENT: store float 4.200000e+01, float* %arrayidx, align 4 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK-NEXT: } +define void @test1(float* nocapture %A, i64 %n) #0 { + entry: + %cmp = icmp sgt i64 %n, 0 + br i1 %cmp, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body, %for.body.lr.ph + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv + store float 4.200000e+01, float* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +attributes #0 = { nounwind } Index: test/Analysis/DivergenceAnalysis/NVPTX/daorder.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/NVPTX/daorder.ll @@ -0,0 +1,47 @@ +; RUN: opt %s -analyze -divergence -use-gpu-divergence-analysis | FileCheck %s + +target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +define i32 @daorder(i32 %n) { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'daorder' +entry: + %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %cond = icmp slt i32 %tid, 0 + br i1 %cond, label %A, label %B ; divergent +; CHECK: DIVERGENT: br i1 %cond, +A: + %defAtA = add i32 %n, 1 ; uniform +; CHECK-NOT: DIVERGENT: %defAtA = + br label %C +B: + %defAtB = add i32 %n, 2 ; uniform +; CHECK-NOT: DIVERGENT: %defAtB = + br label %C +C: + %defAtC = phi i32 [ %defAtA, %A ], [ %defAtB, %B ] ; divergent +; CHECK: DIVERGENT: %defAtC = + br label %D + +D: + %i = phi i32 [0, %C], [ %i.inc, %E ] ; uniform +; CHECK-NOT: DIVERGENT: %i = phi + br label %E + +E: + %i.inc = add i32 %i, 1 + %loopCnt = icmp slt i32 %i.inc, %n +; CHECK-NOT: DIVERGENT: %loopCnt = + br i1 %loopCnt, label %D, label %exit + +exit: + ret i32 %n +} + +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() +declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() +declare i32 @llvm.nvvm.read.ptx.sreg.tid.z() +declare i32 @llvm.nvvm.read.ptx.sreg.laneid() + +!nvvm.annotations = !{!0} +!0 = !{i32 (i32)* @daorder, !"kernel", i32 1} Index: test/Analysis/DivergenceAnalysis/NVPTX/diverge.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/NVPTX/diverge.ll @@ -0,0 +1,175 @@ +; RUN: opt %s -analyze -divergence -use-gpu-divergence-analysis | FileCheck %s + +target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +; return (n < 0 ? a + threadIdx.x : b + threadIdx.x) +define i32 @no_diverge(i32 %n, i32 %a, i32 %b) { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'no_diverge' +entry: + %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %cond = icmp slt i32 %n, 0 + br i1 %cond, label %then, label %else ; uniform +; CHECK-NOT: DIVERGENT: br i1 %cond, +then: + %a1 = add i32 %a, %tid + br label %merge +else: + %b2 = add i32 %b, %tid + br label %merge +merge: + %c = phi i32 [ %a1, %then ], [ %b2, %else ] + ret i32 %c +} + +; c = a; +; if (threadIdx.x < 5) // divergent: data dependent +; c = b; +; return c; // c is divergent: sync dependent +define i32 @sync(i32 %a, i32 %b) { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'sync' +bb1: + %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.y() + %cond = icmp slt i32 %tid, 5 + br i1 %cond, label %bb2, label %bb3 +; CHECK: DIVERGENT: br i1 %cond, +bb2: + br label %bb3 +bb3: + %c = phi i32 [ %a, %bb1 ], [ %b, %bb2 ] ; sync dependent on tid +; CHECK: DIVERGENT: %c = + ret i32 %c +} + +; c = 0; +; if (threadIdx.x >= 5) { // divergent +; c = (n < 0 ? a : b); // c here is uniform because n is uniform +; } +; // c here is divergent because it is sync dependent on threadIdx.x >= 5 +; return c; +define i32 @mixed(i32 %n, i32 %a, i32 %b) { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'mixed' +bb1: + %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.z() + %cond = icmp slt i32 %tid, 5 + br i1 %cond, label %bb6, label %bb2 +; CHECK: DIVERGENT: br i1 %cond, +bb2: + %cond2 = icmp slt i32 %n, 0 + br i1 %cond2, label %bb4, label %bb3 +bb3: + br label %bb5 +bb4: + br label %bb5 +bb5: + %c = phi i32 [ %a, %bb3 ], [ %b, %bb4 ] +; CHECK-NOT: DIVERGENT: %c = + br label %bb6 +bb6: + %c2 = phi i32 [ 0, %bb1], [ %c, %bb5 ] +; CHECK: DIVERGENT: %c2 = + ret i32 %c2 +} + +; We conservatively treats all parameters of a __device__ function as divergent. +define i32 @device(i32 %n, i32 %a, i32 %b) { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'device' +; CHECK: DIVERGENT: i32 %n +; CHECK: DIVERGENT: i32 %a +; CHECK: DIVERGENT: i32 %b +entry: + %cond = icmp slt i32 %n, 0 + br i1 %cond, label %then, label %else +; CHECK: DIVERGENT: br i1 %cond, +then: + br label %merge +else: + br label %merge +merge: + %c = phi i32 [ %a, %then ], [ %b, %else ] + ret i32 %c +} + +; int i = 0; +; do { +; i++; // i here is uniform +; } while (i < laneid); +; return i == 10 ? 0 : 1; // i here is divergent +; +; The i defined in the loop is used outside. +define i32 @loop() { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'loop' +entry: + %laneid = call i32 @llvm.nvvm.read.ptx.sreg.laneid() + br label %loop +loop: + %i = phi i32 [ 0, %entry ], [ %i1, %loop ] +; CHECK-NOT: DIVERGENT: %i = + %i1 = add i32 %i, 1 + %exit_cond = icmp sge i32 %i1, %laneid + br i1 %exit_cond, label %loop_exit, label %loop +loop_exit: + %cond = icmp eq i32 %i, 10 + br i1 %cond, label %then, label %else +; CHECK: DIVERGENT: br i1 %cond, +then: + ret i32 0 +else: + ret i32 1 +} + +; Same as @loop, but the loop is in the LCSSA form. +define i32 @lcssa() { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'lcssa' +entry: + %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + br label %loop +loop: + %i = phi i32 [ 0, %entry ], [ %i1, %loop ] +; CHECK-NOT: DIVERGENT: %i = + %i1 = add i32 %i, 1 + %exit_cond = icmp sge i32 %i1, %tid + br i1 %exit_cond, label %loop_exit, label %loop +loop_exit: + %i.lcssa = phi i32 [ %i, %loop ] +; CHECK: DIVERGENT: %i.lcssa = + %cond = icmp eq i32 %i.lcssa, 10 + br i1 %cond, label %then, label %else +; CHECK: DIVERGENT: br i1 %cond, +then: + ret i32 0 +else: + ret i32 1 +} + +; Verifies sync-dependence is computed correctly in the absense of loops. +define i32 @sync_no_loop(i32 %arg) { +entry: + %0 = add i32 %arg, 1 + %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %1 = icmp sge i32 %tid, 10 + br i1 %1, label %bb1, label %bb2 + +bb1: + br label %bb3 + +bb2: + br label %bb3 + +bb3: + %2 = add i32 %0, 2 + ; CHECK-NOT: DIVERGENT: %2 + ret i32 %2 +} + +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() +declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() +declare i32 @llvm.nvvm.read.ptx.sreg.tid.z() +declare i32 @llvm.nvvm.read.ptx.sreg.laneid() + +!nvvm.annotations = !{!0, !1, !2, !3, !4} +!0 = !{i32 (i32, i32, i32)* @no_diverge, !"kernel", i32 1} +!1 = !{i32 (i32, i32)* @sync, !"kernel", i32 1} +!2 = !{i32 (i32, i32, i32)* @mixed, !"kernel", i32 1} +!3 = !{i32 ()* @loop, !"kernel", i32 1} +!4 = !{i32 (i32)* @sync_no_loop, !"kernel", i32 1} Index: test/Analysis/DivergenceAnalysis/NVPTX/hidden_diverge.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/NVPTX/hidden_diverge.ll @@ -0,0 +1,30 @@ +; RUN: opt %s -analyze -divergence -use-gpu-divergence-analysis | FileCheck %s + +target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +define i32 @hidden_diverge(i32 %n, i32 %a, i32 %b) { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'hidden_diverge' +entry: + %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %cond.var = icmp slt i32 %tid, 0 + br i1 %cond.var, label %B, label %C ; divergent +; CHECK: DIVERGENT: br i1 %cond.var, +B: + %cond.uni = icmp slt i32 %n, 0 + br i1 %cond.uni, label %C, label %merge ; uniform +; CHECK-NOT: DIVERGENT: br i1 %cond.uni, +C: + %phi.var.hidden = phi i32 [ 1, %entry ], [ 2, %B ] +; CHECK: DIVERGENT: %phi.var.hidden = phi i32 + br label %merge +merge: + %phi.ipd = phi i32 [ %a, %B ], [ %b, %C ] +; CHECK: DIVERGENT: %phi.ipd = phi i32 + ret i32 %phi.ipd +} + +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() + +!nvvm.annotations = !{!0} +!0 = !{i32 (i32, i32, i32)* @hidden_diverge, !"kernel", i32 1} Index: test/Analysis/DivergenceAnalysis/NVPTX/irreducible.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/NVPTX/irreducible.ll @@ -0,0 +1,55 @@ +; RUN: opt %s -analyze -divergence -use-gpu-divergence-analysis | FileCheck %s + +target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +; This test contains an unstructured loop. +; +-------------- entry ----------------+ +; | | +; V V +; i1 = phi(0, i3) i2 = phi(0, i3) +; j1 = i1 + 1 ---> i3 = phi(j1, j2) <--- j2 = i2 + 2 +; ^ | ^ +; | V | +; +-------- switch (tid / i3) ----------+ +; | +; V +; if (i3 == 5) // divergent +; because sync dependent on (tid / i3). +define i32 @unstructured_loop(i1 %entry_cond) { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'unstructured_loop' +entry: + %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + br i1 %entry_cond, label %loop_entry_1, label %loop_entry_2 +loop_entry_1: + %i1 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ] + %j1 = add i32 %i1, 1 + br label %loop_body +loop_entry_2: + %i2 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ] + %j2 = add i32 %i2, 2 + br label %loop_body +loop_body: + %i3 = phi i32 [ %j1, %loop_entry_1 ], [ %j2, %loop_entry_2 ] + br label %loop_latch +loop_latch: + %div = sdiv i32 %tid, %i3 + switch i32 %div, label %branch [ i32 1, label %loop_entry_1 + i32 2, label %loop_entry_2 ] +branch: + %cmp = icmp eq i32 %i3, 5 + br i1 %cmp, label %then, label %else +; CHECK: DIVERGENT: br i1 %cmp, +then: + ret i32 0 +else: + ret i32 1 +} + +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() +declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() +declare i32 @llvm.nvvm.read.ptx.sreg.tid.z() +declare i32 @llvm.nvvm.read.ptx.sreg.laneid() + +!nvvm.annotations = !{!0} +!0 = !{i32 (i1)* @unstructured_loop, !"kernel", i32 1} Index: test/Analysis/DivergenceAnalysis/NVPTX/lit.local.cfg =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/NVPTX/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'NVPTX' in config.root.targets: + config.unsupported = True