Index: include/llvm/Analysis/DivergenceAnalysis.h =================================================================== --- include/llvm/Analysis/DivergenceAnalysis.h +++ include/llvm/Analysis/DivergenceAnalysis.h @@ -173,6 +173,33 @@ std::vector Worklist; }; +/// \brief Divergence analysis frontend for GPU kernels. +class GPUDivergenceAnalysis { + SyncDependenceAnalysis SDA; + DivergenceAnalysis DA; + +public: + /// Runs the divergence analysis on @F, a GPU kernel + GPUDivergenceAnalysis(Function &F, const DominatorTree &DT, + const PostDominatorTree &PDT, const LoopInfo &LI, + const TargetTransformInfo &TTI); + + /// Whether any divergence was detected. + bool hasDivergence() const { return DA.hasDetectedDivergence(); } + + /// The GPU kernel this analysis result is for + const Function &getFunction() const { return DA.getFunction(); } + + /// Whether \p V is divergent. + bool isDivergent(const Value &V) const; + + /// Whether \p V is uniform/non-divergent + bool isUniform(const Value &V) const { return !isDivergent(V); } + + /// Print all divergent values in the kernel. + void print(raw_ostream &OS, const Module *) const; +}; + } // namespace llvm #endif // LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H Index: include/llvm/Analysis/LegacyDivergenceAnalysis.h =================================================================== --- include/llvm/Analysis/LegacyDivergenceAnalysis.h +++ include/llvm/Analysis/LegacyDivergenceAnalysis.h @@ -19,9 +19,11 @@ #include "llvm/ADT/DenseSet.h" #include "llvm/IR/Function.h" #include "llvm/Pass.h" +#include "llvm/Analysis/DivergenceAnalysis.h" namespace llvm { class Value; +class GPUDivergenceAnalysis; class LegacyDivergenceAnalysis : public FunctionPass { public: static char ID; @@ -41,7 +43,7 @@ // // Even if this function returns false, V may still be divergent when used // in a different basic block. - bool isDivergent(const Value *V) const { return DivergentValues.count(V); } + bool isDivergent(const Value *V) const; // Returns true if V is uniform/non-divergent. // @@ -53,6 +55,12 @@ void removeValue(const Value *V) { DivergentValues.erase(V); } private: + // Whether analysis should be performed by GPUDivergenceAnalysis. + bool shouldUseGPUDivergenceAnalysis(const Function &F) const; + + // (optional) handle to new DivergenceAnalysis + std::unique_ptr gpuDA; + // Stores all divergent values. DenseSet DivergentValues; }; Index: lib/Analysis/DivergenceAnalysis.cpp =================================================================== --- lib/Analysis/DivergenceAnalysis.cpp +++ lib/Analysis/DivergenceAnalysis.cpp @@ -422,3 +422,36 @@ OS << "DIVERGENT:" << I << '\n'; } } + +// class GPUDivergenceAnalysis +GPUDivergenceAnalysis::GPUDivergenceAnalysis(Function &F, + const DominatorTree &DT, + const PostDominatorTree &PDT, + const LoopInfo &LI, + const TargetTransformInfo &TTI) + : SDA(DT, PDT, LI), DA(F, nullptr, DT, LI, SDA, false) { + for (auto &I : instructions(F)) { + if (TTI.isSourceOfDivergence(&I)) { + DA.markDivergent(I); + } else if (TTI.isAlwaysUniform(&I)) { + DA.addUniformOverride(I); + } + } + for (auto &Arg : F.args()) { + if (TTI.isSourceOfDivergence(&Arg)) { + DA.markDivergent(Arg); + } + } + + DA.compute(); +} + +bool GPUDivergenceAnalysis::isDivergent(const Value &val) const { + return DA.isDivergent(val); +} + +void GPUDivergenceAnalysis::print(raw_ostream &OS, const Module *mod) const { + OS << "Divergence of kernel " << DA.getFunction().getName() << " {\n"; + DA.print(OS, mod); + OS << "}\n"; +} Index: lib/Analysis/LegacyDivergenceAnalysis.cpp =================================================================== --- lib/Analysis/LegacyDivergenceAnalysis.cpp +++ lib/Analysis/LegacyDivergenceAnalysis.cpp @@ -1,4 +1,5 @@ -//===- LegacyDivergenceAnalysis.cpp --------- Legacy Divergence Analysis Implementation -==// +//===- LegacyDivergenceAnalysis.cpp --------- Legacy Divergence Analysis +//Implementation -==// // // The LLVM Compiler Infrastructure // @@ -64,6 +65,9 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/DivergenceAnalysis.h" #include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/Analysis/Passes.h" #include "llvm/Analysis/PostDominators.h" @@ -79,6 +83,12 @@ #define DEBUG_TYPE "divergence" +// transparently use the GPUDivergenceAnalysis +static cl::opt UseGPUDA("use-gpu-divergence-analysis", cl::init(false), + cl::Hidden, + cl::desc("turn the LegacyDivergenceAnalysis into " + "a wrapper for GPUDivergenceAnalysis")); + namespace { class DivergencePropagator { @@ -262,16 +272,17 @@ } } -} /// end namespace anonymous +} // namespace // Register this pass. char LegacyDivergenceAnalysis::ID = 0; -INITIALIZE_PASS_BEGIN(LegacyDivergenceAnalysis, "divergence", "Legacy Divergence Analysis", - false, true) +INITIALIZE_PASS_BEGIN(LegacyDivergenceAnalysis, "divergence", + "Legacy Divergence Analysis", false, true) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) -INITIALIZE_PASS_END(LegacyDivergenceAnalysis, "divergence", "Legacy Divergence Analysis", - false, true) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_END(LegacyDivergenceAnalysis, "divergence", + "Legacy Divergence Analysis", false, true) FunctionPass *llvm::createLegacyDivergenceAnalysisPass() { return new LegacyDivergenceAnalysis(); @@ -280,9 +291,24 @@ void LegacyDivergenceAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); AU.addRequired(); + if (UseGPUDA) + AU.addRequired(); AU.setPreservesAll(); } +bool LegacyDivergenceAnalysis::shouldUseGPUDivergenceAnalysis( + const Function &F) const { + if (!UseGPUDA) + return false; + + // GPUDivergenceAnalysis requires a reducible CFG. + auto &LI = getAnalysis().getLoopInfo(); + using RPOTraversal = ReversePostOrderTraversal; + RPOTraversal FuncRPOT(&F); + return !containsIrreducibleCFG(FuncRPOT, LI); +} + bool LegacyDivergenceAnalysis::runOnFunction(Function &F) { auto *TTIWP = getAnalysisIfAvailable(); if (TTIWP == nullptr) @@ -295,36 +321,59 @@ return false; DivergentValues.clear(); + gpuDA = nullptr; + + auto &DT = getAnalysis().getDomTree(); auto &PDT = getAnalysis().getPostDomTree(); - DivergencePropagator DP(F, TTI, - getAnalysis().getDomTree(), - PDT, DivergentValues); - DP.populateWithSourcesOfDivergence(); - DP.propagate(); - LLVM_DEBUG( - dbgs() << "\nAfter divergence analysis on " << F.getName() << ":\n"; - print(dbgs(), F.getParent()) - ); + + if (shouldUseGPUDivergenceAnalysis(F)) { + // run the new GPU divergence analysis + auto &LI = getAnalysis().getLoopInfo(); + gpuDA = llvm::make_unique(F, DT, PDT, LI, TTI); + + } else { + // run LLVM's existing DivergenceAnalysis + DivergencePropagator DP(F, TTI, DT, PDT, DivergentValues); + DP.populateWithSourcesOfDivergence(); + DP.propagate(); + } + + LLVM_DEBUG(dbgs() << "\nAfter divergence analysis on " << F.getName() + << ":\n"; + print(dbgs(), F.getParent())); + return false; } +bool LegacyDivergenceAnalysis::isDivergent(const Value *V) const { + if (gpuDA) { + return gpuDA->isDivergent(*V); + } + return DivergentValues.count(V); +} + void LegacyDivergenceAnalysis::print(raw_ostream &OS, const Module *) const { - if (DivergentValues.empty()) + if ((!gpuDA || !gpuDA->hasDivergence()) && DivergentValues.empty()) return; - const Value *FirstDivergentValue = *DivergentValues.begin(); + const Function *F; - if (const Argument *Arg = dyn_cast(FirstDivergentValue)) { - F = Arg->getParent(); - } else if (const Instruction *I = - dyn_cast(FirstDivergentValue)) { - F = I->getParent()->getParent(); - } else { - llvm_unreachable("Only arguments and instructions can be divergent"); + if (!DivergentValues.empty()) { + const Value *FirstDivergentValue = *DivergentValues.begin(); + if (const Argument *Arg = dyn_cast(FirstDivergentValue)) { + F = Arg->getParent(); + } else if (const Instruction *I = + dyn_cast(FirstDivergentValue)) { + F = I->getParent()->getParent(); + } else { + llvm_unreachable("Only arguments and instructions can be divergent"); + } + } else if (gpuDA) { + F = &gpuDA->getFunction(); } // Dumps all divergent values in F, arguments and then instructions. for (auto &Arg : F->args()) { - OS << (DivergentValues.count(&Arg) ? "DIVERGENT: " : " "); + OS << (isDivergent(&Arg) ? "DIVERGENT: " : " "); OS << Arg << "\n"; } // Iterate instructions using instructions() to ensure a deterministic order. @@ -332,7 +381,7 @@ auto &BB = *BI; OS << "\n " << BB.getName() << ":\n"; for (auto &I : BB.instructionsWithoutDebug()) { - OS << (DivergentValues.count(&I) ? "DIVERGENT: " : " "); + OS << (isDivergent(&I) ? "DIVERGENT: " : " "); OS << I << "\n"; } } Index: test/Analysis/DivergenceAnalysis/AMDGPU/always_uniform.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/always_uniform.ll @@ -0,0 +1,14 @@ +; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +define amdgpu_kernel void @workitem_id_x() #1 { + %id.x = call i32 @llvm.amdgcn.workitem.id.x() +; CHECK: DIVERGENT: %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %first.lane = call i32 @llvm.amdgcn.readfirstlane(i32 %id.x) +; CHECK-NOT: DIVERGENT: %first.lane = call i32 @llvm.amdgcn.readfirstlane(i32 %id.x) + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 +declare i32 @llvm.amdgcn.readfirstlane(i32) #0 + +attributes #0 = { nounwind readnone } Index: test/Analysis/DivergenceAnalysis/AMDGPU/atomics.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/atomics.ll @@ -0,0 +1,45 @@ +; RUN: opt -mtriple=amdgcn-- -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +; CHECK: DIVERGENT: %orig = atomicrmw xchg i32* %ptr, i32 %val seq_cst +define i32 @test1(i32* %ptr, i32 %val) #0 { + %orig = atomicrmw xchg i32* %ptr, i32 %val seq_cst + ret i32 %orig +} + +; CHECK: DIVERGENT: %orig = cmpxchg i32* %ptr, i32 %cmp, i32 %new seq_cst seq_cst +define {i32, i1} @test2(i32* %ptr, i32 %cmp, i32 %new) { + %orig = cmpxchg i32* %ptr, i32 %cmp, i32 %new seq_cst seq_cst + ret {i32, i1} %orig +} + +; CHECK: DIVERGENT: %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val, i32 0, i32 0, i1 false) +define i32 @test_atomic_inc_i32(i32 addrspace(1)* %ptr, i32 %val) #0 { + %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val, i32 0, i32 0, i1 false) + ret i32 %ret +} + +; CHECK: DIVERGENT: %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val, i32 0, i32 0, i1 false) +define i64 @test_atomic_inc_i64(i64 addrspace(1)* %ptr, i64 %val) #0 { + %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val, i32 0, i32 0, i1 false) + ret i64 %ret +} + +; CHECK: DIVERGENT: %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val, i32 0, i32 0, i1 false) +define i32 @test_atomic_dec_i32(i32 addrspace(1)* %ptr, i32 %val) #0 { + %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val, i32 0, i32 0, i1 false) + ret i32 %ret +} + +; CHECK: DIVERGENT: %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val, i32 0, i32 0, i1 false) +define i64 @test_atomic_dec_i64(i64 addrspace(1)* %ptr, i64 %val) #0 { + %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val, i32 0, i32 0, i1 false) + ret i64 %ret +} + +declare i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* nocapture, i32, i32, i32, i1) #1 +declare i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* nocapture, i64, i32, i32, i1) #1 +declare i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* nocapture, i32, i32, i32, i1) #1 +declare i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* nocapture, i64, i32, i32, i1) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind argmemonly } Index: test/Analysis/DivergenceAnalysis/AMDGPU/hidden_diverge.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/hidden_diverge.ll @@ -0,0 +1,26 @@ +; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +define amdgpu_kernel void @hidden_diverge(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'hidden_diverge' +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %cond.var = icmp slt i32 %tid, 0 + br i1 %cond.var, label %B, label %C ; divergent +; CHECK: DIVERGENT: br i1 %cond.var, +B: + %cond.uni = icmp slt i32 %n, 0 + br i1 %cond.uni, label %C, label %merge ; uniform +; CHECK-NOT: DIVERGENT: br i1 %cond.uni, +C: + %phi.var.hidden = phi i32 [ 1, %entry ], [ 2, %B ] +; CHECK: DIVERGENT: %phi.var.hidden = phi i32 + br label %merge +merge: + %phi.ipd = phi i32 [ %a, %B ], [ %b, %C ] +; CHECK: DIVERGENT: %phi.ipd = phi i32 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 + +attributes #0 = { nounwind readnone } Index: test/Analysis/DivergenceAnalysis/AMDGPU/hidden_loopdiverge.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/hidden_loopdiverge.ll @@ -0,0 +1,223 @@ +; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +; divergent loop (H
, B) +; the divergent join point in %exit is obscured by uniform control joining in %X +define amdgpu_kernel void @hidden_loop_diverge(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'hidden_loop_diverge': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + br i1 %uni.cond, label %X, label %H ; uniform + +H: + %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %B ] + %div.exitx = icmp slt i32 %tid, 0 + br i1 %div.exitx, label %X, label %B ; divergent branch +; CHECK: DIVERGENT: %div.exitx = +; CHECK: DIVERGENT: br i1 %div.exitx, + +B: + %uni.inc = add i32 %uni.merge.h, 1 + %div.exity = icmp sgt i32 %tid, 0 + br i1 %div.exity, label %Y, label %H ; divergent branch +; CHECK: DIVERGENT: %div.exity = +; CHECK: DIVERGENT: br i1 %div.exity, + +X: + %div.merge.x = phi i32 [ %a, %entry ], [ %uni.merge.h, %H ] ; temporal divergent phi + br i1 %uni.cond, label %Y, label %exit +; CHECK: DIVERGENT: %div.merge.x = + +Y: + %div.merge.y = phi i32 [ 42, %X ], [ %b, %B ] + br label %exit +; CHECK: DIVERGENT: %div.merge.y = + +exit: + %div.merge.exit = phi i32 [ %a, %X ], [ %b, %Y ] + ret void +; CHECK: DIVERGENT: %div.merge.exit = +} + +; divergent loop (H
, B) +; the phi nodes in X and Y don't actually receive divergent values +define amdgpu_kernel void @unobserved_loop_diverge(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'unobserved_loop_diverge': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + br i1 %uni.cond, label %X, label %H ; uniform + +H: + %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %B ] + %div.exitx = icmp slt i32 %tid, 0 + br i1 %div.exitx, label %X, label %B ; divergent branch +; CHECK: DIVERGENT: %div.exitx = +; CHECK: DIVERGENT: br i1 %div.exitx, + +B: + %uni.inc = add i32 %uni.merge.h, 1 + %div.exity = icmp sgt i32 %tid, 0 + br i1 %div.exity, label %Y, label %H ; divergent branch +; CHECK: DIVERGENT: %div.exity = +; CHECK: DIVERGENT: br i1 %div.exity, + +X: + %uni.merge.x = phi i32 [ %a, %entry ], [ %b, %H ] + br label %exit + +Y: + %uni.merge.y = phi i32 [ %b, %B ] + br label %exit + +exit: + %div.merge.exit = phi i32 [ %a, %X ], [ %b, %Y ] + ret void +; CHECK: DIVERGENT: %div.merge.exit = +} + +; divergent loop (G
, L) inside divergent loop (H
, B, C, D, G, L) +; the inner loop has no exit to top level. +; the outer loop becomes divergent as its exiting branch in C is control-dependent on the inner loop's divergent loop exit in D. +define amdgpu_kernel void @hidden_nestedloop_diverge(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'hidden_nestedloop_diverge': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + %div.exitx = icmp slt i32 %tid, 0 + br i1 %uni.cond, label %X, label %H + +H: + %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %D ] + br i1 %uni.cond, label %G, label %B +; CHECK: DIVERGENT: %div.exitx = + +B: + br i1 %uni.cond, label %X, label %C + +C: + br i1 %uni.cond, label %Y, label %D + +D: + %uni.inc = add i32 %uni.merge.h, 1 + br label %H + +G: + br i1 %div.exitx, label %C, label %L +; CHECK: DIVERGENT: br i1 %div.exitx, + +L: + br i1 %uni.cond, label %D, label %G + +X: + %div.merge.x = phi i32 [ %a, %entry ], [ %uni.merge.h, %B ] ; temporal divergent phi + br i1 %uni.cond, label %Y, label %exit +; CHECK: DIVERGENT: %div.merge.x = + +Y: + %div.merge.y = phi i32 [ 42, %X ], [ %b, %C ] + br label %exit +; CHECK: DIVERGENT: %div.merge.y = + +exit: + %div.merge.exit = phi i32 [ %a, %X ], [ %b, %Y ] + ret void +; CHECK: DIVERGENT: %div.merge.exit = +} + +; divergent loop (G
, L) in divergent loop (H
, B, C, G, L) +; the outer loop has no immediately divergent exiting edge. +; the inner exiting edge is exiting to top-level through the outer loop causing both to become divergent. +define amdgpu_kernel void @hidden_doublebreak_diverge(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'hidden_doublebreak_diverge': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + %div.exitx = icmp slt i32 %tid, 0 + br i1 %uni.cond, label %X, label %H + +H: + %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %C ] + br i1 %uni.cond, label %G, label %B +; CHECK: DIVERGENT: %div.exitx = + +B: + br i1 %uni.cond, label %Y, label %C + +C: + %uni.inc = add i32 %uni.merge.h, 1 + br label %H + +G: + br i1 %div.exitx, label %X, label %L ; two-level break +; CHECK: DIVERGENT: br i1 %div.exitx, + +L: + br i1 %uni.cond, label %C, label %G + +X: + %div.merge.x = phi i32 [ %a, %entry ], [ %uni.merge.h, %G ] ; temporal divergence + br label %Y +; CHECK: DIVERGENT: %div.merge.x = + +Y: + %div.merge.y = phi i32 [ 42, %X ], [ %b, %B ] + ret void +; CHECK: DIVERGENT: %div.merge.y = +} + +; divergent loop (G
, L) contained inside a uniform loop (H
, B, G, L , D) +define amdgpu_kernel void @hidden_containedloop_diverge(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'hidden_containedloop_diverge': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + %div.exitx = icmp slt i32 %tid, 0 + br i1 %uni.cond, label %X, label %H + +H: + %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc.d, %D ] + br i1 %uni.cond, label %G, label %B +; CHECK: DIVERGENT: %div.exitx = + +B: + %div.merge.b = phi i32 [ 42, %H ], [ %uni.merge.g, %G ] + br label %D +; CHECK: DIVERGENT: %div.merge.b = + +G: + %uni.merge.g = phi i32 [ 123, %H ], [ %uni.inc.l, %L ] + br i1 %div.exitx, label %B, label %L +; CHECK: DIVERGENT: br i1 %div.exitx, + +L: + %uni.inc.l = add i32 %uni.merge.g, 1 + br i1 %uni.cond, label %G, label %D + +D: + %uni.inc.d = add i32 %uni.merge.h, 1 + br i1 %uni.cond, label %X, label %H + +X: + %uni.merge.x = phi i32 [ %a, %entry ], [ %uni.inc.d, %D ] + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 + +attributes #0 = { nounwind readnone } Index: test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll @@ -0,0 +1,13 @@ +; RUN: opt -mtriple=amdgcn-- -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +; CHECK: DIVERGENT: %swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0 +define amdgpu_kernel void @ds_swizzle(i32 addrspace(1)* %out, i32 %src) #0 { + %swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0 + store i32 %swizzle, i32 addrspace(1)* %out, align 4 + ret void +} + +declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1 + +attributes #0 = { nounwind convergent } +attributes #1 = { nounwind readnone convergent } Index: test/Analysis/DivergenceAnalysis/AMDGPU/irreducible.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/irreducible.ll @@ -0,0 +1,48 @@ +; RUN: opt -mtriple=amdgcn-- -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +; This test contains an unstructured loop. +; +-------------- entry ----------------+ +; | | +; V V +; i1 = phi(0, i3) i2 = phi(0, i3) +; j1 = i1 + 1 ---> i3 = phi(j1, j2) <--- j2 = i2 + 2 +; ^ | ^ +; | V | +; +-------- switch (tid / i3) ----------+ +; | +; V +; if (i3 == 5) // divergent +; because sync dependent on (tid / i3). +define i32 @unstructured_loop(i1 %entry_cond) { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'unstructured_loop' +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + br i1 %entry_cond, label %loop_entry_1, label %loop_entry_2 +loop_entry_1: + %i1 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ] + %j1 = add i32 %i1, 1 + br label %loop_body +loop_entry_2: + %i2 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ] + %j2 = add i32 %i2, 2 + br label %loop_body +loop_body: + %i3 = phi i32 [ %j1, %loop_entry_1 ], [ %j2, %loop_entry_2 ] + br label %loop_latch +loop_latch: + %div = sdiv i32 %tid, %i3 + switch i32 %div, label %branch [ i32 1, label %loop_entry_1 + i32 2, label %loop_entry_2 ] +branch: + %cmp = icmp eq i32 %i3, 5 + br i1 %cmp, label %then, label %else +; CHECK: DIVERGENT: br i1 %cmp, +then: + ret i32 0 +else: + ret i32 1 +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 + +attributes #0 = { nounwind readnone } Index: test/Analysis/DivergenceAnalysis/AMDGPU/kernel-args.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/kernel-args.ll @@ -0,0 +1,41 @@ +; RUN: opt %s -mtriple amdgcn-- -analyze -divergence -use-gpu-divergence-analysis | FileCheck %s + +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'test_amdgpu_ps': +; CHECK: DIVERGENT: +; CHECK-NOT: %arg0 +; CHECK-NOT: %arg1 +; CHECK-NOT: %arg2 +; CHECK: <2 x i32> %arg3 +; CHECK: DIVERGENT: <3 x i32> %arg4 +; CHECK: DIVERGENT: float %arg5 +; CHECK: DIVERGENT: i32 %arg6 + +define amdgpu_ps void @test_amdgpu_ps([4 x <16 x i8>] addrspace(2)* byval %arg0, float inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <3 x i32> %arg4, float %arg5, i32 %arg6) #0 { + ret void +} + +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'test_amdgpu_kernel': +; CHECK-NOT: %arg0 +; CHECK-NOT: %arg1 +; CHECK-NOT: %arg2 +; CHECK-NOT: %arg3 +; CHECK-NOT: %arg4 +; CHECK-NOT: %arg5 +; CHECK-NOT: %arg6 +define amdgpu_kernel void @test_amdgpu_kernel([4 x <16 x i8>] addrspace(2)* byval %arg0, float inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <3 x i32> %arg4, float %arg5, i32 %arg6) #0 { + ret void +} + +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'test_c': +; CHECK: DIVERGENT: +; CHECK: DIVERGENT: +; CHECK: DIVERGENT: +; CHECK: DIVERGENT: +; CHECK: DIVERGENT: +; CHECK: DIVERGENT: +; CHECK: DIVERGENT: +define void @test_c([4 x <16 x i8>] addrspace(2)* byval %arg0, float inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <3 x i32> %arg4, float %arg5, i32 %arg6) #0 { + ret void +} + +attributes #0 = { nounwind } Index: test/Analysis/DivergenceAnalysis/AMDGPU/lit.local.cfg =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'AMDGPU' in config.root.targets: + config.unsupported = True Index: test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll @@ -0,0 +1,103 @@ +;RUN: opt -mtriple=amdgcn-mesa-mesa3d -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.swap( +define float @buffer_atomic_swap(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.swap(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.add( +define float @buffer_atomic_add(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.add(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.sub( +define float @buffer_atomic_sub(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.sub(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.smin( +define float @buffer_atomic_smin(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.smin(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.umin( +define float @buffer_atomic_umin(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.umin(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.smax( +define float @buffer_atomic_smax(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.smax(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.umax( +define float @buffer_atomic_umax(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.umax(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.and( +define float @buffer_atomic_and(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.and(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.or( +define float @buffer_atomic_or(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.or(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.xor( +define float @buffer_atomic_xor(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.xor(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.cmpswap( +define float @buffer_atomic_cmpswap(<4 x i32> inreg %rsrc, i32 inreg %data, i32 inreg %cmp) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %data, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +declare i32 @llvm.amdgcn.buffer.atomic.swap(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.add(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.smin(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.umin(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.smax(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.umax(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.and(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.or(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.xor(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32, i32, <4 x i32>, i32, i32, i1) #0 + +attributes #0 = { nounwind } Index: test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.image.atomic.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.image.atomic.ll @@ -0,0 +1,131 @@ +;RUN: opt -mtriple=amdgcn-mesa-mesa3d -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32( +define float @image_atomic_swap(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32( +define float @image_atomic_add(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i32( +define float @image_atomic_sub(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i32( +define float @image_atomic_smin(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i32( +define float @image_atomic_umin(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i32( +define float @image_atomic_smax(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i32( +define float @image_atomic_umax(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.and.1d.i32.i32( +define float @image_atomic_and(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.and.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.or.1d.i32.i32( +define float @image_atomic_or(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.or.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i32( +define float @image_atomic_xor(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i32( +define float @image_atomic_inc(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i32( +define float @image_atomic_dec(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32( +define float @image_atomic_cmpswap(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data, i32 inreg %cmp) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32 %data, i32 %cmp, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32( +define float @image_atomic_add_2d(<8 x i32> inreg %rsrc, i32 inreg %s, i32 inreg %t, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32 %data, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +declare i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.and.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.or.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #0 + +declare i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #0 + +attributes #0 = { nounwind } Index: test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll @@ -0,0 +1,30 @@ +; RUN: opt %s -mtriple amdgcn-- -analyze -divergence -use-gpu-divergence-analysis | FileCheck %s + +; CHECK: DIVERGENT: %tmp5 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %tmp2 +; CHECK: DIVERGENT: %tmp10 = load volatile float, float addrspace(1)* %tmp5, align 4 +; CHECK: DIVERGENT: %tmp11 = load volatile float, float addrspace(1)* %tmp5, align 4 + +; The post dominator tree does not have a root node in this case +define amdgpu_kernel void @no_return_blocks(float addrspace(1)* noalias nocapture readonly %arg, float addrspace(1)* noalias nocapture readonly %arg1) #0 { +bb0: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0 + %tmp2 = sext i32 %tmp to i64 + %tmp5 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %tmp2 + %tmp6 = load volatile float, float addrspace(1)* %tmp5, align 4 + %tmp8 = fcmp olt float %tmp6, 0.000000e+00 + br i1 %tmp8, label %bb1, label %bb2 + +bb1: + %tmp10 = load volatile float, float addrspace(1)* %tmp5, align 4 + br label %bb2 + +bb2: + %tmp11 = load volatile float, float addrspace(1)* %tmp5, align 4 + br label %bb1 +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } Index: test/Analysis/DivergenceAnalysis/AMDGPU/phi-undef.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/phi-undef.ll @@ -0,0 +1,31 @@ +; RUN: opt -mtriple=amdgcn-- -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +; CHECK-LABEL: 'test1': +; CHECK-NEXT: DIVERGENT: i32 %bound +; CHECK: {{^ *}}%counter = +; CHECK-NEXT: DIVERGENT: %break = icmp sge i32 %counter, %bound +; CHECK-NEXT: DIVERGENT: br i1 %break, label %footer, label %body +; CHECK: {{^ *}}%counter.next = +; CHECK: {{^ *}}%counter.footer = +; CHECK: DIVERGENT: br i1 %break, label %end, label %header +; Note: %counter is not divergent! +define amdgpu_ps void @test1(i32 %bound) { +entry: + br label %header + +header: + %counter = phi i32 [ 0, %entry ], [ %counter.footer, %footer ] + %break = icmp sge i32 %counter, %bound + br i1 %break, label %footer, label %body + +body: + %counter.next = add i32 %counter, 1 + br label %footer + +footer: + %counter.footer = phi i32 [ %counter.next, %body ], [ undef, %header ] + br i1 %break, label %end, label %header + +end: + ret void +} Index: test/Analysis/DivergenceAnalysis/AMDGPU/temporal_diverge.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/temporal_diverge.ll @@ -0,0 +1,154 @@ +; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +; temporal-divergent use of value carried by divergent loop +define amdgpu_kernel void @temporal_diverge(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'temporal_diverge': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + br label %H + +H: + %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %H ] + %uni.inc = add i32 %uni.merge.h, 1 + %div.exitx = icmp slt i32 %tid, 0 + br i1 %div.exitx, label %X, label %H ; divergent branch +; CHECK: DIVERGENT: %div.exitx = +; CHECK: DIVERGENT: br i1 %div.exitx, + +X: + %div.user = add i32 %uni.inc, 5 + ret void +} + +; temporal-divergent use of value carried by divergent loop inside a top-level loop +define amdgpu_kernel void @temporal_diverge_inloop(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'temporal_diverge_inloop': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + br label %G + +G: + br label %H + +H: + %uni.merge.h = phi i32 [ 0, %G ], [ %uni.inc, %H ] + %uni.inc = add i32 %uni.merge.h, 1 + %div.exitx = icmp slt i32 %tid, 0 + br i1 %div.exitx, label %X, label %H ; divergent branch +; CHECK: DIVERGENT: %div.exitx = +; CHECK: DIVERGENT: br i1 %div.exitx, + +X: + %div.user = add i32 %uni.inc, 5 + br i1 %uni.cond, label %G, label %Y + +Y: + %div.alsouser = add i32 %uni.inc, 5 + ret void +} + + +; temporal-uniform use of a valud, definition and users are carried by a surrounding divergent loop +define amdgpu_kernel void @temporal_uniform_indivloop(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'temporal_uniform_indivloop': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + br label %G + +G: + br label %H + +H: + %uni.merge.h = phi i32 [ 0, %G ], [ %uni.inc, %H ] + %uni.inc = add i32 %uni.merge.h, 1 + br i1 %uni.cond, label %X, label %H ; divergent branch + +X: + %uni.user = add i32 %uni.inc, 5 + %div.exity = icmp slt i32 %tid, 0 +; CHECK: DIVERGENT: %div.exity = + br i1 %div.exity, label %G, label %Y +; CHECK: DIVERGENT: br i1 %div.exity, + +Y: + %div.alsouser = add i32 %uni.inc, 5 + ret void +} + + +; temporal-divergent use of value carried by divergent loop, user is inside sibling loop +define amdgpu_kernel void @temporal_diverge_loopuser(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'temporal_diverge_loopuser': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + br label %H + +H: + %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %H ] + %uni.inc = add i32 %uni.merge.h, 1 + %div.exitx = icmp slt i32 %tid, 0 + br i1 %div.exitx, label %X, label %H ; divergent branch +; CHECK: DIVERGENT: %div.exitx = +; CHECK: DIVERGENT: br i1 %div.exitx, + +X: + br label %G + +G: + %div.user = add i32 %uni.inc, 5 + br i1 %uni.cond, label %G, label %Y + +Y: + ret void +} + +; temporal-divergent use of value carried by divergent loop, user is inside sibling loop, defs and use are carried by a uniform loop +define amdgpu_kernel void @temporal_diverge_loopuser_nested(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'temporal_diverge_loopuser_nested': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + br label %H + +H: + %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %H ] + %uni.inc = add i32 %uni.merge.h, 1 + %div.exitx = icmp slt i32 %tid, 0 + br i1 %div.exitx, label %X, label %H ; divergent branch +; CHECK: DIVERGENT: %div.exitx = +; CHECK: DIVERGENT: br i1 %div.exitx, + +X: + br label %G + +G: + %div.user = add i32 %uni.inc, 5 + br i1 %uni.cond, label %G, label %Y + +Y: + ret void +} + + +declare i32 @llvm.amdgcn.workitem.id.x() #0 + +attributes #0 = { nounwind readnone } Index: test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll @@ -0,0 +1,45 @@ +; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +declare i32 @llvm.amdgcn.workitem.id.x() #0 +declare i32 @llvm.amdgcn.workitem.id.y() #0 +declare i32 @llvm.amdgcn.workitem.id.z() #0 +declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0 +declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0 + +; CHECK: DIVERGENT: %id.x = call i32 @llvm.amdgcn.workitem.id.x() +define amdgpu_kernel void @workitem_id_x() #1 { + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + store volatile i32 %id.x, i32 addrspace(1)* undef + ret void +} + +; CHECK: DIVERGENT: %id.y = call i32 @llvm.amdgcn.workitem.id.y() +define amdgpu_kernel void @workitem_id_y() #1 { + %id.y = call i32 @llvm.amdgcn.workitem.id.y() + store volatile i32 %id.y, i32 addrspace(1)* undef + ret void +} + +; CHECK: DIVERGENT: %id.z = call i32 @llvm.amdgcn.workitem.id.z() +define amdgpu_kernel void @workitem_id_z() #1 { + %id.z = call i32 @llvm.amdgcn.workitem.id.z() + store volatile i32 %id.z, i32 addrspace(1)* undef + ret void +} + +; CHECK: DIVERGENT: %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 0, i32 0) +define amdgpu_kernel void @mbcnt_lo() #1 { + %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 0, i32 0) + store volatile i32 %mbcnt.lo, i32 addrspace(1)* undef + ret void +} + +; CHECK: DIVERGENT: %mbcnt.hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 0, i32 0) +define amdgpu_kernel void @mbcnt_hi() #1 { + %mbcnt.hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 0, i32 0) + store volatile i32 %mbcnt.hi, i32 addrspace(1)* undef + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } Index: test/Analysis/DivergenceAnalysis/NVPTX/daorder.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/NVPTX/daorder.ll @@ -0,0 +1,47 @@ +; RUN: opt %s -analyze -divergence -use-gpu-divergence-analysis | FileCheck %s + +target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +define i32 @daorder(i32 %n) { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'daorder' +entry: + %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %cond = icmp slt i32 %tid, 0 + br i1 %cond, label %A, label %B ; divergent +; CHECK: DIVERGENT: br i1 %cond, +A: + %defAtA = add i32 %n, 1 ; uniform +; CHECK-NOT: DIVERGENT: %defAtA = + br label %C +B: + %defAtB = add i32 %n, 2 ; uniform +; CHECK-NOT: DIVERGENT: %defAtB = + br label %C +C: + %defAtC = phi i32 [ %defAtA, %A ], [ %defAtB, %B ] ; divergent +; CHECK: DIVERGENT: %defAtC = + br label %D + +D: + %i = phi i32 [0, %C], [ %i.inc, %E ] ; uniform +; CHECK-NOT: DIVERGENT: %i = phi + br label %E + +E: + %i.inc = add i32 %i, 1 + %loopCnt = icmp slt i32 %i.inc, %n +; CHECK-NOT: DIVERGENT: %loopCnt = + br i1 %loopCnt, label %D, label %exit + +exit: + ret i32 %n +} + +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() +declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() +declare i32 @llvm.nvvm.read.ptx.sreg.tid.z() +declare i32 @llvm.nvvm.read.ptx.sreg.laneid() + +!nvvm.annotations = !{!0} +!0 = !{i32 (i32)* @daorder, !"kernel", i32 1} Index: test/Analysis/DivergenceAnalysis/NVPTX/diverge.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/NVPTX/diverge.ll @@ -0,0 +1,175 @@ +; RUN: opt %s -analyze -divergence -use-gpu-divergence-analysis | FileCheck %s + +target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +; return (n < 0 ? a + threadIdx.x : b + threadIdx.x) +define i32 @no_diverge(i32 %n, i32 %a, i32 %b) { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'no_diverge' +entry: + %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %cond = icmp slt i32 %n, 0 + br i1 %cond, label %then, label %else ; uniform +; CHECK-NOT: DIVERGENT: br i1 %cond, +then: + %a1 = add i32 %a, %tid + br label %merge +else: + %b2 = add i32 %b, %tid + br label %merge +merge: + %c = phi i32 [ %a1, %then ], [ %b2, %else ] + ret i32 %c +} + +; c = a; +; if (threadIdx.x < 5) // divergent: data dependent +; c = b; +; return c; // c is divergent: sync dependent +define i32 @sync(i32 %a, i32 %b) { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'sync' +bb1: + %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.y() + %cond = icmp slt i32 %tid, 5 + br i1 %cond, label %bb2, label %bb3 +; CHECK: DIVERGENT: br i1 %cond, +bb2: + br label %bb3 +bb3: + %c = phi i32 [ %a, %bb1 ], [ %b, %bb2 ] ; sync dependent on tid +; CHECK: DIVERGENT: %c = + ret i32 %c +} + +; c = 0; +; if (threadIdx.x >= 5) { // divergent +; c = (n < 0 ? a : b); // c here is uniform because n is uniform +; } +; // c here is divergent because it is sync dependent on threadIdx.x >= 5 +; return c; +define i32 @mixed(i32 %n, i32 %a, i32 %b) { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'mixed' +bb1: + %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.z() + %cond = icmp slt i32 %tid, 5 + br i1 %cond, label %bb6, label %bb2 +; CHECK: DIVERGENT: br i1 %cond, +bb2: + %cond2 = icmp slt i32 %n, 0 + br i1 %cond2, label %bb4, label %bb3 +bb3: + br label %bb5 +bb4: + br label %bb5 +bb5: + %c = phi i32 [ %a, %bb3 ], [ %b, %bb4 ] +; CHECK-NOT: DIVERGENT: %c = + br label %bb6 +bb6: + %c2 = phi i32 [ 0, %bb1], [ %c, %bb5 ] +; CHECK: DIVERGENT: %c2 = + ret i32 %c2 +} + +; We conservatively treats all parameters of a __device__ function as divergent. +define i32 @device(i32 %n, i32 %a, i32 %b) { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'device' +; CHECK: DIVERGENT: i32 %n +; CHECK: DIVERGENT: i32 %a +; CHECK: DIVERGENT: i32 %b +entry: + %cond = icmp slt i32 %n, 0 + br i1 %cond, label %then, label %else +; CHECK: DIVERGENT: br i1 %cond, +then: + br label %merge +else: + br label %merge +merge: + %c = phi i32 [ %a, %then ], [ %b, %else ] + ret i32 %c +} + +; int i = 0; +; do { +; i++; // i here is uniform +; } while (i < laneid); +; return i == 10 ? 0 : 1; // i here is divergent +; +; The i defined in the loop is used outside. +define i32 @loop() { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'loop' +entry: + %laneid = call i32 @llvm.nvvm.read.ptx.sreg.laneid() + br label %loop +loop: + %i = phi i32 [ 0, %entry ], [ %i1, %loop ] +; CHECK-NOT: DIVERGENT: %i = + %i1 = add i32 %i, 1 + %exit_cond = icmp sge i32 %i1, %laneid + br i1 %exit_cond, label %loop_exit, label %loop +loop_exit: + %cond = icmp eq i32 %i, 10 + br i1 %cond, label %then, label %else +; CHECK: DIVERGENT: br i1 %cond, +then: + ret i32 0 +else: + ret i32 1 +} + +; Same as @loop, but the loop is in the LCSSA form. +define i32 @lcssa() { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'lcssa' +entry: + %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + br label %loop +loop: + %i = phi i32 [ 0, %entry ], [ %i1, %loop ] +; CHECK-NOT: DIVERGENT: %i = + %i1 = add i32 %i, 1 + %exit_cond = icmp sge i32 %i1, %tid + br i1 %exit_cond, label %loop_exit, label %loop +loop_exit: + %i.lcssa = phi i32 [ %i, %loop ] +; CHECK: DIVERGENT: %i.lcssa = + %cond = icmp eq i32 %i.lcssa, 10 + br i1 %cond, label %then, label %else +; CHECK: DIVERGENT: br i1 %cond, +then: + ret i32 0 +else: + ret i32 1 +} + +; Verifies sync-dependence is computed correctly in the absense of loops. +define i32 @sync_no_loop(i32 %arg) { +entry: + %0 = add i32 %arg, 1 + %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %1 = icmp sge i32 %tid, 10 + br i1 %1, label %bb1, label %bb2 + +bb1: + br label %bb3 + +bb2: + br label %bb3 + +bb3: + %2 = add i32 %0, 2 + ; CHECK-NOT: DIVERGENT: %2 + ret i32 %2 +} + +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() +declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() +declare i32 @llvm.nvvm.read.ptx.sreg.tid.z() +declare i32 @llvm.nvvm.read.ptx.sreg.laneid() + +!nvvm.annotations = !{!0, !1, !2, !3, !4} +!0 = !{i32 (i32, i32, i32)* @no_diverge, !"kernel", i32 1} +!1 = !{i32 (i32, i32)* @sync, !"kernel", i32 1} +!2 = !{i32 (i32, i32, i32)* @mixed, !"kernel", i32 1} +!3 = !{i32 ()* @loop, !"kernel", i32 1} +!4 = !{i32 (i32)* @sync_no_loop, !"kernel", i32 1} Index: test/Analysis/DivergenceAnalysis/NVPTX/hidden_diverge.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/NVPTX/hidden_diverge.ll @@ -0,0 +1,30 @@ +; RUN: opt %s -analyze -divergence -use-gpu-divergence-analysis | FileCheck %s + +target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +define i32 @hidden_diverge(i32 %n, i32 %a, i32 %b) { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'hidden_diverge' +entry: + %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %cond.var = icmp slt i32 %tid, 0 + br i1 %cond.var, label %B, label %C ; divergent +; CHECK: DIVERGENT: br i1 %cond.var, +B: + %cond.uni = icmp slt i32 %n, 0 + br i1 %cond.uni, label %C, label %merge ; uniform +; CHECK-NOT: DIVERGENT: br i1 %cond.uni, +C: + %phi.var.hidden = phi i32 [ 1, %entry ], [ 2, %B ] +; CHECK: DIVERGENT: %phi.var.hidden = phi i32 + br label %merge +merge: + %phi.ipd = phi i32 [ %a, %B ], [ %b, %C ] +; CHECK: DIVERGENT: %phi.ipd = phi i32 + ret i32 %phi.ipd +} + +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() + +!nvvm.annotations = !{!0} +!0 = !{i32 (i32, i32, i32)* @hidden_diverge, !"kernel", i32 1} Index: test/Analysis/DivergenceAnalysis/NVPTX/irreducible.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/NVPTX/irreducible.ll @@ -0,0 +1,55 @@ +; RUN: opt %s -analyze -divergence -use-gpu-divergence-analysis | FileCheck %s + +target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +; This test contains an unstructured loop. +; +-------------- entry ----------------+ +; | | +; V V +; i1 = phi(0, i3) i2 = phi(0, i3) +; j1 = i1 + 1 ---> i3 = phi(j1, j2) <--- j2 = i2 + 2 +; ^ | ^ +; | V | +; +-------- switch (tid / i3) ----------+ +; | +; V +; if (i3 == 5) // divergent +; because sync dependent on (tid / i3). +define i32 @unstructured_loop(i1 %entry_cond) { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'unstructured_loop' +entry: + %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + br i1 %entry_cond, label %loop_entry_1, label %loop_entry_2 +loop_entry_1: + %i1 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ] + %j1 = add i32 %i1, 1 + br label %loop_body +loop_entry_2: + %i2 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ] + %j2 = add i32 %i2, 2 + br label %loop_body +loop_body: + %i3 = phi i32 [ %j1, %loop_entry_1 ], [ %j2, %loop_entry_2 ] + br label %loop_latch +loop_latch: + %div = sdiv i32 %tid, %i3 + switch i32 %div, label %branch [ i32 1, label %loop_entry_1 + i32 2, label %loop_entry_2 ] +branch: + %cmp = icmp eq i32 %i3, 5 + br i1 %cmp, label %then, label %else +; CHECK: DIVERGENT: br i1 %cmp, +then: + ret i32 0 +else: + ret i32 1 +} + +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() +declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() +declare i32 @llvm.nvvm.read.ptx.sreg.tid.z() +declare i32 @llvm.nvvm.read.ptx.sreg.laneid() + +!nvvm.annotations = !{!0} +!0 = !{i32 (i1)* @unstructured_loop, !"kernel", i32 1} Index: test/Analysis/DivergenceAnalysis/NVPTX/lit.local.cfg =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/NVPTX/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'NVPTX' in config.root.targets: + config.unsupported = True