Index: include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- include/llvm/IR/IntrinsicsAMDGPU.td
+++ include/llvm/IR/IntrinsicsAMDGPU.td
@@ -705,6 +705,9 @@
 
 def int_amdgcn_end_cf : Intrinsic<[], [llvm_i64_ty], [IntrConvergent]>;
 
+// Represent unreachable in a divergent region.
+def int_amdgcn_unreachable : Intrinsic<[], [], [IntrConvergent]>;
+
 // Emit 2.5 ulp, no denormal division. Should only be inserted by
 // pass based on !fpmath metadata.
 def int_amdgcn_fdiv_fast : Intrinsic<
Index: lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- lib/Target/AMDGPU/AMDGPU.h
+++ lib/Target/AMDGPU/AMDGPU.h
@@ -119,6 +119,9 @@
 void initializeSIInsertWaitsPass(PassRegistry&);
 extern char &SIInsertWaitsID;
 
+void initializeAMDGPUUnifyDivergentExitNodesPass(PassRegistry&);
+extern char &AMDGPUUnifyDivergentExitNodesID;
+
 Target &getTheAMDGPUTarget();
 Target &getTheGCNTarget();
 
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -119,6 +119,7 @@
   initializeSIInsertSkipsPass(*PR);
   initializeSIDebuggerInsertNopsPass(*PR);
   initializeSIOptimizeExecMaskingPass(*PR);
+  initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
 }
 
 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -600,6 +601,10 @@
   // supported.
   const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
   addPass(createAMDGPUAnnotateKernelFeaturesPass(&TM));
+
+  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
+  // regions formed by them.
+  addPass(&AMDGPUUnifyDivergentExitNodesID);
   addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
   addPass(createSinkingPass());
   addPass(createSITypeRewriter());
Index: lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -0,0 +1,247 @@
+//===- AMDGPUUnifyDivergentExitNodes.cpp ----------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a variant of the UnifyFunctionExitNodes pass. Rather than ensuring
+// there is at most one ret and one unreachable instruction, it ensures there is
+// at most one divergent exiting block.
+//
+// StructurizeCFG can't deal with multi-exit regions formed by branches to
+// multiple return nodes. It is not desirable to structurize regions with
+// uniform branches, so unifying those to the same return block as divergent
+// branches inhibits use of scalar branching. It still can't deal with the case
+// where one branch goes to return, and one unreachable. Replace unreachable in
+// this case with a return.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-unify-divergent-exit-nodes"
+
+namespace {
+
+/// Merges the divergently reached exit blocks of a function into a single
+/// return block so that StructurizeCFG does not see a multi-exit region.
+class AMDGPUUnifyDivergentExitNodes : public FunctionPass {
+public:
+  static char ID; // Pass identification, replacement for typeid
+  AMDGPUUnifyDivergentExitNodes() : FunctionPass(ID) {
+    initializeAMDGPUUnifyDivergentExitNodesPass(*PassRegistry::getPassRegistry());
+  }
+
+  // We can preserve non-critical-edgeness when we unify function exit nodes
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+  bool runOnFunction(Function &F) override;
+};
+
+}
+
+char AMDGPUUnifyDivergentExitNodes::ID = 0;
+INITIALIZE_PASS_BEGIN(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
+                     "Unify divergent function exit nodes", false, false)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_END(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
+                    "Unify divergent function exit nodes", false, false)
+
+char &llvm::AMDGPUUnifyDivergentExitNodesID = AMDGPUUnifyDivergentExitNodes::ID;
+
+void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const{
+  // TODO: Preserve dominator tree.
+  AU.addRequired<PostDominatorTreeWrapperPass>();
+
+  AU.addRequired<DivergenceAnalysis>();
+
+  // No divergent values are changed, only blocks and branch edges.
+  AU.addPreserved<DivergenceAnalysis>();
+
+  // We preserve the non-critical-edgeness property
+  AU.addPreservedID(BreakCriticalEdgesID);
+
+  // This is a cluster of orthogonal Transforms
+  AU.addPreservedID(LowerSwitchID);
+  FunctionPass::getAnalysisUsage(AU);
+
+  AU.addRequired<TargetTransformInfoWrapperPass>();
+}
+
+/// \returns true if \p BB is reachable through only uniform branches.
+///
+/// Walks the transitive predecessors of \p BB and fails as soon as one of
+/// them ends in a terminator that \p DA does not report as uniform.
+/// XXX - Is there a more efficient way to find this?
+static bool isUniformlyReached(const DivergenceAnalysis &DA,
+                               BasicBlock &BB) {
+  SmallVector<BasicBlock *, 8> Stack;
+  SmallPtrSet<BasicBlock *, 8> Visited;
+
+  // Record the direct predecessors as visited too, so that one reached again
+  // through a cycle is not examined a second time.
+  for (BasicBlock *Pred : predecessors(&BB)) {
+    if (Visited.insert(Pred).second)
+      Stack.push_back(Pred);
+  }
+
+  while (!Stack.empty()) {
+    BasicBlock *Top = Stack.pop_back_val();
+    if (!DA.isUniform(Top->getTerminator()))
+      return false;
+
+    for (BasicBlock *Pred : predecessors(Top)) {
+      if (Visited.insert(Pred).second)
+        Stack.push_back(Pred);
+    }
+  }
+
+  return true;
+}
+
+/// Creates a new block named \p Name that returns from \p F and rewrites every
+/// block in \p ReturningBlocks to branch to it instead of returning. For a
+/// non-void function the returned values are merged with a PHI node.
+/// \returns the new unified return block.
+static BasicBlock *unifyReturnBlockSet(Function &F,
+                                       ArrayRef<BasicBlock *> ReturningBlocks,
+                                       const TargetTransformInfo &TTI,
+                                       StringRef Name) {
+  // Insert a new basic block into the function, add a PHI node (if the
+  // function returns values), and convert all of the return instructions into
+  // unconditional branches.
+  //
+  BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), Name, &F);
+
+  PHINode *PN = nullptr;
+  if (F.getReturnType()->isVoidTy()) {
+    ReturnInst::Create(F.getContext(), nullptr, NewRetBlock);
+  } else {
+    // If the function doesn't return void... add a PHI node to the block...
+    PN = PHINode::Create(F.getReturnType(), ReturningBlocks.size(),
+                         "UnifiedRetVal");
+    NewRetBlock->getInstList().push_back(PN);
+    ReturnInst::Create(F.getContext(), PN, NewRetBlock);
+  }
+
+  // Loop over all of the blocks, replacing the return instruction with an
+  // unconditional branch.
+  //
+  for (BasicBlock *BB : ReturningBlocks) {
+    // Add an incoming element to the PHI node for every return instruction that
+    // is merging into this new block...
+    if (PN)
+      PN->addIncoming(BB->getTerminator()->getOperand(0), BB);
+
+    BB->getInstList().pop_back(); // Remove the return insn
+    BranchInst::Create(NewRetBlock, BB);
+  }
+
+  for (BasicBlock *BB : ReturningBlocks) {
+    // Cleanup possible branch to unconditional branch to the return.
+    SimplifyCFG(BB, TTI, 2);
+  }
+
+  return NewRetBlock;
+}
+
+/// Unifies the divergently reached return and unreachable blocks of \p F.
+/// \returns true if the function was modified.
+bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
+  auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+  if (PDT.getRoots().size() <= 1)
+    return false;
+
+  DivergenceAnalysis &DA = getAnalysis<DivergenceAnalysis>();
+
+  // Loop over all of the blocks in a function, tracking all of the blocks that
+  // return.
+  //
+  SmallVector<BasicBlock *, 4> ReturningBlocks;
+  SmallVector<BasicBlock *, 4> UnreachableBlocks;
+
+  for (BasicBlock *BB : PDT.getRoots()) {
+    if (isa<ReturnInst>(BB->getTerminator())) {
+      if (!isUniformlyReached(DA, *BB))
+        ReturningBlocks.push_back(BB);
+    } else if (isa<UnreachableInst>(BB->getTerminator())) {
+      if (!isUniformlyReached(DA, *BB))
+        UnreachableBlocks.push_back(BB);
+    }
+  }
+
+  // Set as soon as the IR is touched. Unifying the unreachable blocks modifies
+  // the function even when there turns out to be nothing to do for the
+  // returning blocks below.
+  bool Changed = false;
+
+  if (!UnreachableBlocks.empty()) {
+    BasicBlock *UnreachableBlock = nullptr;
+
+    if (UnreachableBlocks.size() == 1) {
+      UnreachableBlock = UnreachableBlocks.front();
+    } else {
+      UnreachableBlock = BasicBlock::Create(F.getContext(),
+                                            "UnifiedUnreachableBlock", &F);
+      new UnreachableInst(F.getContext(), UnreachableBlock);
+
+      for (BasicBlock *BB : UnreachableBlocks) {
+        BB->getInstList().pop_back(); // Remove the unreachable inst.
+        BranchInst::Create(UnreachableBlock, BB);
+      }
+      Changed = true;
+    }
+
+    if (!ReturningBlocks.empty()) {
+      // Don't create a new unreachable inst if we have a return. The
+      // structurizer/annotator can't handle the multiple exits
+
+      Type *RetTy = F.getReturnType();
+      Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);
+      UnreachableBlock->getInstList().pop_back(); // Remove the unreachable inst.
+
+      Function *UnreachableIntrin =
+        Intrinsic::getDeclaration(F.getParent(), Intrinsic::amdgcn_unreachable);
+
+      // Insert a call to an intrinsic tracking that this is an unreachable
+      // point, in case we want to kill the active lanes or something later.
+      CallInst::Create(UnreachableIntrin, {}, "", UnreachableBlock);
+
+      // Don't create a scalar trap. We would only want to trap if this code was
+      // really reached, but a scalar trap would happen even if no lanes
+      // actually reached here.
+      ReturnInst::Create(F.getContext(), RetVal, UnreachableBlock);
+      ReturningBlocks.push_back(UnreachableBlock);
+      Changed = true;
+    }
+  }
+
+  // Now handle return blocks.
+  if (ReturningBlocks.empty())
+    return Changed; // No blocks return
+
+  if (ReturningBlocks.size() == 1)
+    return Changed; // Already has a single return block
+
+  const TargetTransformInfo &TTI
+    = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+
+  unifyReturnBlockSet(F, ReturningBlocks, TTI, "UnifiedReturnBlock");
+  return true;
+}
Index: lib/Target/AMDGPU/CMakeLists.txt
===================================================================
--- lib/Target/AMDGPU/CMakeLists.txt
+++ lib/Target/AMDGPU/CMakeLists.txt
@@ -57,6 +57,7 @@
   AMDGPUInstrInfo.cpp
   AMDGPUPromoteAlloca.cpp
   AMDGPURegisterInfo.cpp
+  AMDGPUUnifyDivergentExitNodes.cpp
   GCNHazardRecognizer.cpp
   GCNSchedStrategy.cpp
   R600ClauseMergePass.cpp
Index: lib/Target/AMDGPU/SIInstrFormats.td
===================================================================
--- lib/Target/AMDGPU/SIInstrFormats.td
+++ lib/Target/AMDGPU/SIInstrFormats.td
@@ -138,19 +138,19 @@
   let AsmVariantName = AMDGPUAsmVariants.Default;
 }
 
-class PseudoInstSI<dag outs, dag ins, list<dag> pattern = []>
-  : InstSI<outs, ins, "", pattern> {
+class PseudoInstSI<dag outs, dag ins, list<dag> pattern = [], string asm = "">
+  : InstSI<outs, ins, asm, pattern> {
   let isPseudo = 1;
   let isCodeGenOnly = 1;
 }
 
-class SPseudoInstSI<dag outs, dag ins, list<dag> pattern = []>
-  : PseudoInstSI<outs, ins, pattern> {
+class SPseudoInstSI<dag outs, dag ins, list<dag> pattern = [], string asm = "">
+  : PseudoInstSI<outs, ins, pattern, asm> {
   let SALU = 1;
 }
 
-class VPseudoInstSI<dag outs, dag ins, list<dag> pattern = []>
-  : PseudoInstSI<outs, ins, pattern> {
+class VPseudoInstSI<dag outs, dag ins, list<dag> pattern = [], string asm = "">
+  : PseudoInstSI<outs, ins, pattern, asm> {
   let VALU = 1;
   let Uses = [EXEC];
 }
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3797,16 +3797,11 @@
   if (DescSize != 0 && DescSize != 4)
     return DescSize;
 
-  if (Opc == AMDGPU::WAVE_BARRIER)
-    return 0;
-
   // 4-byte instructions may have a 32-bit literal encoded after them. Check
   // operands that coud ever be literals.
   if (isVALU(MI) || isSALU(MI)) {
-    if (isFixedSize(MI)) {
-      assert(DescSize == 4);
+    if (isFixedSize(MI))
       return DescSize;
-    }
 
     int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
     if (Src0Idx == -1)
@@ -3829,7 +3824,6 @@
     return 4;
 
   switch (Opc) {
-  case AMDGPU::SI_MASK_BRANCH:
   case TargetOpcode::IMPLICIT_DEF:
   case TargetOpcode::KILL:
   case TargetOpcode::DBG_VALUE:
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -152,6 +152,8 @@
   let mayStore = 1;
   let isBarrier = 1;
   let isConvergent = 1;
+  let FixedSize = 1;
+  let Size = 0;
 }
 
 // SI pseudo instructions. These are used by the CFG structurizer pass
@@ -159,14 +161,15 @@
 
 // Dummy terminator instruction to use after control flow instructions
 // replaced with exec mask operations.
-def SI_MASK_BRANCH : PseudoInstSI <
+def SI_MASK_BRANCH : VPseudoInstSI <
   (outs), (ins brtarget:$target)> {
   let isBranch = 0;
   let isTerminator = 1;
   let isBarrier = 0;
-  let Uses = [EXEC];
   let SchedRW = [];
   let hasNoSchedulingInfo = 1;
+  let FixedSize = 1;
+  let Size = 0;
 }
 
 let isTerminator = 1 in {
@@ -260,6 +263,14 @@
   let SALU = 1;
 }
 
+def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins),
+  [(int_amdgcn_unreachable)],
+  "; divergent unreachable"> {
+  let Size = 0;
+  let hasNoSchedulingInfo = 1;
+  let FixedSize = 1;
+}
+
 // Used as an isel pseudo to directly emit initialization with an
 // s_mov_b32 rather than a copy of another initialized
 // register. MachineCSE skips copies, and we don't want to have to
Index: test/CodeGen/AMDGPU/branch-condition-and.ll
===================================================================
--- test/CodeGen/AMDGPU/branch-condition-and.ll
+++ test/CodeGen/AMDGPU/branch-condition-and.ll
@@ -15,12 +15,16 @@
 ; GCN: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[OTHERCC]]
 ; GCN: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[AND]]
 ; GCN: s_xor_b64 {{s\[[0-9]+:[0-9]+\]}}, exec, [[SAVED]]
-;
-; TODO: The following sequence is a bug (missing s_endpgm)!
-;
-; GCN: s_branch [[BB:BB[0-9]+_[0-9]+]]
-; GCN: [[BB]]:
-; GCN-NEXT: .Lfunc_end0:
+; GCN: ; mask branch [[BB5:BB[0-9]+_[0-9]+]]
+
+; GCN-NEXT: BB{{[0-9]+_[0-9]+}}: ; %bb4
+; GCN: ds_write_b32
+; GCN: s_waitcnt
+
+; GCN-NEXT: [[BB5]]
+; GCN: s_or_b64 exec, exec
+; GCN-NEXT: s_endpgm
+; GCN-NEXT: .Lfunc_end
 define amdgpu_ps void @ham(float %arg, float %arg1) #0 {
 bb:
   %tmp = fcmp ogt float %arg, 0.000000e+00
@@ -29,6 +33,7 @@
   br i1 %tmp3, label %bb4, label %bb5
 
 bb4:                                              ; preds = %bb
+  store volatile i32 4, i32 addrspace(3)* undef
   unreachable
 
 bb5:                                              ; preds = %bb
Index: test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@@ -0,0 +1,710 @@
+; RUN: opt -mtriple=amdgcn-- -S -amdgpu-unify-divergent-exit-nodes -verify -structurizecfg -verify -si-annotate-control-flow %s | FileCheck -check-prefix=IR %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; Add extra verifier runs. There were some cases where invalid IR
+; was produced but happened to be fixed by the later passes.
+
+; Make sure divergent control flow with multiple exits from a region
+; is properly handled. UnifyFunctionExitNodes should be run before
+; StructurizeCFG.
+ +; IR-LABEL: @multi_divergent_region_exit_ret_ret( +; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0) +; IR: %2 = extractvalue { i1, i64 } %1, 0 +; IR: %3 = extractvalue { i1, i64 } %1, 1 +; IR: br i1 %2, label %LeafBlock1, label %Flow + +; IR: Flow: +; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] +; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ] +; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3) +; IR: %7 = extractvalue { i1, i64 } %6, 0 +; IR: %8 = extractvalue { i1, i64 } %6, 1 +; IR: br i1 %7, label %LeafBlock, label %Flow1 + +; IR: LeafBlock: +; IR: br label %Flow1 + +; IR: LeafBlock1: +; IR: br label %Flow{{$}} + +; IR: Flow2: +; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %19) +; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11) +; IR: %13 = extractvalue { i1, i64 } %12, 0 +; IR: %14 = extractvalue { i1, i64 } %12, 1 +; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock + +; IR: exit0: +; IR: store volatile i32 9, i32 addrspace(1)* undef +; IR: br label %UnifiedReturnBlock + +; IR: Flow1: +; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ] +; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ] +; IR: call void @llvm.amdgcn.end.cf(i64 %8) +; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16) +; IR: %18 = extractvalue { i1, i64 } %17, 0 +; IR: %19 = extractvalue { i1, i64 } %17, 1 +; IR: br i1 %18, label %exit1, label %Flow2 + +; IR: exit1: +; IR: store volatile i32 17, i32 addrspace(3)* undef +; IR: br label %Flow2 + +; IR: UnifiedReturnBlock: +; IR: call void @llvm.amdgcn.end.cf(i64 %14) +; IR: ret void + + +; GCN-LABEL: {{^}}multi_divergent_region_exit_ret_ret: +; GCN: v_cmp_lt_i32_e32 vcc, 1 +; GCN: s_and_saveexec_b64 +; GCN: s_xor_b64 + + +; FIXME: Why is this compare essentially repeated? 
+; GCN: v_cmp_eq_u32_e32 vcc, 1, [[REG:v[0-9]+]] +; GCN-NEXT: v_cmp_ne_u32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1, [[REG]] +; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc +; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1 + +; GCN: ; %Flow1 +; GCN-NEXT: s_or_b64 exec, exec +; GCN: v_cmp_ne_u32_e32 vcc, 0 + +; GCN: ; %exit1 +; GCN: ds_write_b32 + +; GCN: %Flow2 +; GCN-NEXT: s_or_b64 exec, exec +; GCN: v_cmp_ne_u32_e32 vcc, 0 +; GCN-NEXT: s_and_saveexec_b64 +; GCN-NEXT: s_xor_b64 + +; GCN: ; %exit0 +; GCN: buffer_store_dword + +; GCN: ; %UnifiedReturnBlock +; GCN-NEXT: s_or_b64 exec, exec +; GCN-NEXT: s_endpgm +define amdgpu_kernel void @multi_divergent_region_exit_ret_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 { +entry: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %tmp1 = add i32 0, %tmp + %tmp2 = zext i32 %tmp1 to i64 + %tmp3 = add i64 0, %tmp2 + %tmp4 = shl i64 %tmp3, 32 + %tmp5 = ashr exact i64 %tmp4, 32 + %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5 + %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4 + %tmp8 = sext i32 %tmp7 to i64 + %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8 + %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4 + %tmp13 = zext i32 %tmp10 to i64 + %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13 + %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16 + %Pivot = icmp slt i32 %tmp16, 2 + br i1 %Pivot, label %LeafBlock, label %LeafBlock1 + +LeafBlock: ; preds = %entry + %SwitchLeaf = icmp eq i32 %tmp16, 1 + br i1 %SwitchLeaf, label %exit0, label %exit1 + +LeafBlock1: ; preds = %entry + %SwitchLeaf2 = icmp eq i32 %tmp16, 2 + br i1 %SwitchLeaf2, label %exit0, label %exit1 + +exit0: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 9, i32 addrspace(1)* undef + ret void + +exit1: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 17, i32 addrspace(3)* undef + ret void +} + +; IR-LABEL: 
@multi_divergent_region_exit_unreachable_unreachable( +; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0) + +; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3) + +; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %19) +; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11) +; IR: br i1 %13, label %exit0, label %UnifiedUnreachableBlock + + +; IR: UnifiedUnreachableBlock: +; IR-NEXT: unreachable + + +; FIXME: Probably should insert an s_endpgm anyway. +; GCN-LABEL: {{^}}multi_divergent_region_exit_unreachable_unreachable: +; GCN: ; %UnifiedUnreachableBlock +; GCN-NEXT: .Lfunc_end +define amdgpu_kernel void @multi_divergent_region_exit_unreachable_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 { +entry: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %tmp1 = add i32 0, %tmp + %tmp2 = zext i32 %tmp1 to i64 + %tmp3 = add i64 0, %tmp2 + %tmp4 = shl i64 %tmp3, 32 + %tmp5 = ashr exact i64 %tmp4, 32 + %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5 + %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4 + %tmp8 = sext i32 %tmp7 to i64 + %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8 + %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4 + %tmp13 = zext i32 %tmp10 to i64 + %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13 + %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16 + %Pivot = icmp slt i32 %tmp16, 2 + br i1 %Pivot, label %LeafBlock, label %LeafBlock1 + +LeafBlock: ; preds = %entry + %SwitchLeaf = icmp eq i32 %tmp16, 1 + br i1 %SwitchLeaf, label %exit0, label %exit1 + +LeafBlock1: ; preds = %entry + %SwitchLeaf2 = icmp eq i32 %tmp16, 2 + br i1 %SwitchLeaf2, label %exit0, label %exit1 + +exit0: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 9, i32 addrspace(1)* undef + unreachable + +exit1: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 17, i32 addrspace(3)* 
undef + unreachable +} + +; IR-LABEL: @multi_exit_region_divergent_ret_uniform_ret( +; IR: %divergent.cond0 = icmp slt i32 %tmp16, 2 +; IR: llvm.amdgcn.if +; IR: br i1 + +; IR: {{^}}Flow: +; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] +; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ] +; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3) +; IR: br i1 %7, label %LeafBlock, label %Flow1 + +; IR: {{^}}LeafBlock: +; IR: %divergent.cond1 = icmp eq i32 %tmp16, 1 +; IR: %9 = xor i1 %divergent.cond1, true +; IR: br label %Flow1 + +; IR: LeafBlock1: +; IR: %uniform.cond0 = icmp eq i32 %arg3, 2 +; IR: %10 = xor i1 %uniform.cond0, true +; IR: br label %Flow + +; IR: Flow2: +; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %19) +; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11) +; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock + +; IR: exit0: +; IR: store volatile i32 9, i32 addrspace(1)* undef +; IR: br label %UnifiedReturnBlock + +; IR: {{^}}Flow1: +; IR: %15 = phi i1 [ %divergent.cond1, %LeafBlock ], [ %4, %Flow ] +; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ] +; IR: call void @llvm.amdgcn.end.cf(i64 %8) +; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16) +; IR: %18 = extractvalue { i1, i64 } %17, 0 +; IR: %19 = extractvalue { i1, i64 } %17, 1 +; IR: br i1 %18, label %exit1, label %Flow2 + +; IR: exit1: +; IR: store volatile i32 17, i32 addrspace(3)* undef +; IR: br label %Flow2 + +; IR: UnifiedReturnBlock: +; IR: call void @llvm.amdgcn.end.cf(i64 %14) +; IR: ret void +define amdgpu_kernel void @multi_exit_region_divergent_ret_uniform_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 { +entry: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %tmp1 = add i32 0, %tmp + %tmp2 = zext i32 %tmp1 to i64 + %tmp3 = add i64 0, %tmp2 + %tmp4 = shl i64 %tmp3, 32 + %tmp5 = ashr exact i64 %tmp4, 32 + %tmp6 = getelementptr 
inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5 + %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4 + %tmp8 = sext i32 %tmp7 to i64 + %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8 + %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4 + %tmp13 = zext i32 %tmp10 to i64 + %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13 + %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16 + %divergent.cond0 = icmp slt i32 %tmp16, 2 + br i1 %divergent.cond0, label %LeafBlock, label %LeafBlock1 + +LeafBlock: ; preds = %entry + %divergent.cond1 = icmp eq i32 %tmp16, 1 + br i1 %divergent.cond1, label %exit0, label %exit1 + +LeafBlock1: ; preds = %entry + %uniform.cond0 = icmp eq i32 %arg3, 2 + br i1 %uniform.cond0, label %exit0, label %exit1 + +exit0: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 9, i32 addrspace(1)* undef + ret void + +exit1: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 17, i32 addrspace(3)* undef + ret void +} + +; IR-LABEL: @multi_exit_region_uniform_ret_divergent_ret( +; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0) +; IR: br i1 %2, label %LeafBlock1, label %Flow + +; IR: Flow: +; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] +; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ] +; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3) + +; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %19) +; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11) + +define amdgpu_kernel void @multi_exit_region_uniform_ret_divergent_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 { +entry: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %tmp1 = add i32 0, %tmp + %tmp2 = zext i32 %tmp1 to i64 + %tmp3 = add i64 0, %tmp2 + %tmp4 = shl i64 %tmp3, 32 + %tmp5 = ashr exact i64 %tmp4, 32 + %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5 + %tmp7 = load i32, 
i32 addrspace(1)* %tmp6, align 4 + %tmp8 = sext i32 %tmp7 to i64 + %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8 + %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4 + %tmp13 = zext i32 %tmp10 to i64 + %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13 + %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16 + %Pivot = icmp slt i32 %tmp16, 2 + br i1 %Pivot, label %LeafBlock, label %LeafBlock1 + +LeafBlock: ; preds = %entry + %SwitchLeaf = icmp eq i32 %arg3, 1 + br i1 %SwitchLeaf, label %exit0, label %exit1 + +LeafBlock1: ; preds = %entry + %SwitchLeaf2 = icmp eq i32 %tmp16, 2 + br i1 %SwitchLeaf2, label %exit0, label %exit1 + +exit0: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 9, i32 addrspace(1)* undef + ret void + +exit1: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 17, i32 addrspace(3)* undef + ret void +} + +; IR-LABEL: @multi_divergent_region_exit_ret_ret_return_value( +; IR: Flow2: +; IR: %11 = phi float [ 2.000000e+00, %exit1 ], [ undef, %Flow1 ] +; IR: %12 = phi i1 [ false, %exit1 ], [ %16, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %20) + +; IR: UnifiedReturnBlock: +; IR: %UnifiedRetVal = phi float [ %11, %Flow2 ], [ 1.000000e+00, %exit0 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %15) +; IR: ret float %UnifiedRetVal +define amdgpu_ps float @multi_divergent_region_exit_ret_ret_return_value(i32 %vgpr) #0 { +entry: + %Pivot = icmp slt i32 %vgpr, 2 + br i1 %Pivot, label %LeafBlock, label %LeafBlock1 + +LeafBlock: ; preds = %entry + %SwitchLeaf = icmp eq i32 %vgpr, 1 + br i1 %SwitchLeaf, label %exit0, label %exit1 + +LeafBlock1: ; preds = %entry + %SwitchLeaf2 = icmp eq i32 %vgpr, 2 + br i1 %SwitchLeaf2, label %exit0, label %exit1 + +exit0: ; preds = %LeafBlock, %LeafBlock1 + store i32 9, i32 addrspace(1)* undef + ret float 1.0 + +exit1: ; preds = %LeafBlock, %LeafBlock1 + store i32 17, i32 addrspace(3)* undef + ret float 2.0 +} + +; IR-LABEL: 
@uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value( + +; GCN-LABEL: {{^}}uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value: +; GCN: s_cmp_gt_i32 s0, 1 +; GCN: s_cbranch_scc0 [[FLOW:BB[0-9]+_[0-9]+]] + +; GCN: v_cmp_ne_u32_e32 vcc, 7, v0 + +; GCN: {{^}}[[FLOW]]: +; GCN: s_cbranch_vccnz [[FLOW1:BB[0-9]+]] + +; GCN: v_mov_b32_e32 v0, 2.0 +; GCN: s_or_b64 exec, exec +; GCN: s_and_b64 exec, exec +; GCN: v_mov_b32_e32 v0, 1.0 + +; GCN: {{^BB[0-9]+_[0-9]+}}: ; %UnifiedReturnBlock +; GCN-NEXT: s_or_b64 exec, exec +; GCN-NEXT: ; return + +define amdgpu_ps float @uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value(i32 inreg %sgpr, i32 %vgpr) #0 { +entry: + %uniform.cond = icmp slt i32 %sgpr, 2 + br i1 %uniform.cond, label %LeafBlock, label %LeafBlock1 + +LeafBlock: ; preds = %entry + %divergent.cond0 = icmp eq i32 %vgpr, 3 + br i1 %divergent.cond0, label %exit0, label %exit1 + +LeafBlock1: ; preds = %entry + %divergent.cond1 = icmp eq i32 %vgpr, 7 + br i1 %divergent.cond1, label %exit0, label %exit1 + +exit0: ; preds = %LeafBlock, %LeafBlock1 + store i32 9, i32 addrspace(1)* undef + ret float 1.0 + +exit1: ; preds = %LeafBlock, %LeafBlock1 + store i32 17, i32 addrspace(3)* undef + ret float 2.0 +} + +; IR-LABEL: @multi_divergent_region_exit_ret_unreachable( +; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0) + +; IR: Flow: +; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] +; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ] +; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3) + +; IR: Flow2: +; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %19) +; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11) +; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock + +; IR: exit0: +; IR-NEXT: store volatile i32 17, i32 addrspace(3)* undef +; IR-NEXT: br label %UnifiedReturnBlock + +; IR: Flow1: +; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ] +; IR: %16 = phi 
i1 [ %9, %LeafBlock ], [ %5, %Flow ] +; IR: call void @llvm.amdgcn.end.cf(i64 %8) +; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16) +; IR: %18 = extractvalue { i1, i64 } %17, 0 +; IR: %19 = extractvalue { i1, i64 } %17, 1 +; IR: br i1 %18, label %exit1, label %Flow2 + +; IR: exit1: +; IR-NEXT: store volatile i32 9, i32 addrspace(1)* undef +; IR-NEXT: call void @llvm.amdgcn.unreachable() +; IR-NEXT: br label %Flow2 + +; IR: UnifiedReturnBlock: +; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %14) +; IR-NEXT: ret void +define amdgpu_kernel void @multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 { +entry: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %tmp1 = add i32 0, %tmp + %tmp2 = zext i32 %tmp1 to i64 + %tmp3 = add i64 0, %tmp2 + %tmp4 = shl i64 %tmp3, 32 + %tmp5 = ashr exact i64 %tmp4, 32 + %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5 + %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4 + %tmp8 = sext i32 %tmp7 to i64 + %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8 + %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4 + %tmp13 = zext i32 %tmp10 to i64 + %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13 + %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16 + %Pivot = icmp slt i32 %tmp16, 2 + br i1 %Pivot, label %LeafBlock, label %LeafBlock1 + +LeafBlock: ; preds = %entry + %SwitchLeaf = icmp eq i32 %tmp16, 1 + br i1 %SwitchLeaf, label %exit0, label %exit1 + +LeafBlock1: ; preds = %entry + %SwitchLeaf2 = icmp eq i32 %tmp16, 2 + br i1 %SwitchLeaf2, label %exit0, label %exit1 + +exit0: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 17, i32 addrspace(3)* undef + ret void + +exit1: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 9, i32 addrspace(1)* undef + unreachable +} + +; The non-uniformity of the branch to the exiting blocks requires +; looking at transitive 
predecessors. + +; IR-LABEL: @indirect_multi_divergent_region_exit_ret_unreachable( + +; IR: exit0: ; preds = %Flow2 +; IR-NEXT: store volatile i32 17, i32 addrspace(3)* undef +; IR-NEXT: br label %UnifiedReturnBlock + + +; IR: indirect.exit1: +; IR: %load = load volatile i32, i32 addrspace(1)* undef +; IR: store volatile i32 %load, i32 addrspace(1)* undef +; IR: store volatile i32 9, i32 addrspace(1)* undef +; IR: call void @llvm.amdgcn.unreachable() +; IR-NEXT: br label %Flow2 + +; IR: UnifiedReturnBlock: ; preds = %exit0, %Flow2 +; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %14) +; IR-NEXT: ret void +define amdgpu_kernel void @indirect_multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 { +entry: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %tmp1 = add i32 0, %tmp + %tmp2 = zext i32 %tmp1 to i64 + %tmp3 = add i64 0, %tmp2 + %tmp4 = shl i64 %tmp3, 32 + %tmp5 = ashr exact i64 %tmp4, 32 + %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5 + %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4 + %tmp8 = sext i32 %tmp7 to i64 + %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8 + %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4 + %tmp13 = zext i32 %tmp10 to i64 + %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13 + %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16 + %Pivot = icmp slt i32 %tmp16, 2 + br i1 %Pivot, label %LeafBlock, label %LeafBlock1 + +LeafBlock: ; preds = %entry + %SwitchLeaf = icmp eq i32 %tmp16, 1 + br i1 %SwitchLeaf, label %exit0, label %indirect.exit1 + +LeafBlock1: ; preds = %entry + %SwitchLeaf2 = icmp eq i32 %tmp16, 2 + br i1 %SwitchLeaf2, label %exit0, label %indirect.exit1 + +exit0: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 17, i32 addrspace(3)* undef + ret void + +indirect.exit1: + %load = load volatile i32, i32 addrspace(1)* undef + store volatile i32 
%load, i32 addrspace(1)* undef + br label %exit1 + +exit1: ; preds = %indirect.exit1 + store volatile i32 9, i32 addrspace(1)* undef + unreachable +} + +; IR-LABEL: @multi_divergent_region_exit_ret_switch( +define amdgpu_kernel void @multi_divergent_region_exit_ret_switch(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 { +entry: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %tmp1 = add i32 0, %tmp + %tmp2 = zext i32 %tmp1 to i64 + %tmp3 = add i64 0, %tmp2 + %tmp4 = shl i64 %tmp3, 32 + %tmp5 = ashr exact i64 %tmp4, 32 + %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5 + %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4 + %tmp8 = sext i32 %tmp7 to i64 + %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8 + %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4 + %tmp13 = zext i32 %tmp10 to i64 + %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13 + %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16 + switch i32 %tmp16, label %exit1 + [ i32 1, label %LeafBlock + i32 2, label %LeafBlock1 + i32 3, label %exit0 ] + +LeafBlock: ; preds = %entry + %SwitchLeaf = icmp eq i32 %tmp16, 1 + br i1 %SwitchLeaf, label %exit0, label %exit1 + +LeafBlock1: ; preds = %entry + %SwitchLeaf2 = icmp eq i32 %tmp16, 2 + br i1 %SwitchLeaf2, label %exit0, label %exit1 + +exit0: ; preds = %LeafBlock1, %LeafBlock, %entry + store volatile i32 17, i32 addrspace(3)* undef + ret void + +exit1: ; preds = %LeafBlock1, %LeafBlock, %entry + store volatile i32 9, i32 addrspace(1)* undef + unreachable +} + +; IR-LABEL: @divergent_multi_ret_nest_in_uniform_triangle( +define amdgpu_kernel void @divergent_multi_ret_nest_in_uniform_triangle(i32 %arg0) #0 { +entry: + %uniform.cond0 = icmp eq i32 %arg0, 4 + br i1 %uniform.cond0, label %divergent.multi.exit.region, label %uniform.ret + +divergent.multi.exit.region: + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %divergent.cond0 = 
icmp eq i32 %id.x, 0 + br i1 %divergent.cond0, label %divergent.ret0, label %divergent.ret1 + +divergent.ret0: + store volatile i32 11, i32 addrspace(3)* undef + ret void + +divergent.ret1: + store volatile i32 42, i32 addrspace(3)* undef + ret void + +uniform.ret: + store volatile i32 9, i32 addrspace(1)* undef + ret void +} + +; IR-LABEL: @divergent_complex_multi_ret_nest_in_uniform_triangle( +define amdgpu_kernel void @divergent_complex_multi_ret_nest_in_uniform_triangle(i32 %arg0) #0 { +entry: + %uniform.cond0 = icmp eq i32 %arg0, 4 + br i1 %uniform.cond0, label %divergent.multi.exit.region, label %uniform.ret + +divergent.multi.exit.region: + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %divergent.cond0 = icmp eq i32 %id.x, 0 + br i1 %divergent.cond0, label %divergent.if, label %divergent.ret1 + +divergent.if: + %vgpr0 = load volatile float, float addrspace(1)* undef + %divergent.cond1 = fcmp ogt float %vgpr0, 1.0 + br i1 %divergent.cond1, label %divergent.then, label %divergent.endif + +divergent.then: + %vgpr1 = load volatile float, float addrspace(1)* undef + %divergent.cond2 = fcmp olt float %vgpr1, 4.0 + store volatile i32 33, i32 addrspace(1)* undef + br i1 %divergent.cond2, label %divergent.ret0, label %divergent.endif + +divergent.endif: + store volatile i32 38, i32 addrspace(1)* undef + br label %divergent.ret0 + +divergent.ret0: + store volatile i32 11, i32 addrspace(3)* undef + ret void + +divergent.ret1: + store volatile i32 42, i32 addrspace(3)* undef + ret void + +uniform.ret: + store volatile i32 9, i32 addrspace(1)* undef + ret void +} + +; IR-LABEL: @uniform_complex_multi_ret_nest_in_divergent_triangle( +; IR: Flow1: ; preds = %uniform.ret1, %uniform.multi.exit.region +; IR: %8 = phi i1 [ false, %uniform.ret1 ], [ true, %uniform.multi.exit.region ] +; IR: br i1 %8, label %uniform.if, label %Flow2 + +; IR: Flow: ; preds = %uniform.then, %uniform.if +; IR: %11 = phi i1 [ %10, %uniform.then ], [ %9, %uniform.if ] +; IR: br i1 %11, label 
%uniform.endif, label %uniform.ret0 + +; IR: UnifiedReturnBlock: ; preds = %Flow3, %Flow2 +; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %6) +; IR-NEXT: ret void +define amdgpu_kernel void @uniform_complex_multi_ret_nest_in_divergent_triangle(i32 %arg0) #0 { +entry: + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %divergent.cond0 = icmp eq i32 %id.x, 0 + br i1 %divergent.cond0, label %uniform.multi.exit.region, label %divergent.ret + +uniform.multi.exit.region: + %uniform.cond0 = icmp eq i32 %arg0, 4 + br i1 %uniform.cond0, label %uniform.if, label %uniform.ret1 + +uniform.if: + %sgpr0 = load volatile i32, i32 addrspace(2)* undef + %uniform.cond1 = icmp slt i32 %sgpr0, 1 + br i1 %uniform.cond1, label %uniform.then, label %uniform.endif + +uniform.then: + %sgpr1 = load volatile i32, i32 addrspace(2)* undef + %uniform.cond2 = icmp sge i32 %sgpr1, 4 + store volatile i32 33, i32 addrspace(1)* undef + br i1 %uniform.cond2, label %uniform.ret0, label %uniform.endif + +uniform.endif: + store volatile i32 38, i32 addrspace(1)* undef + br label %uniform.ret0 + +uniform.ret0: + store volatile i32 11, i32 addrspace(3)* undef + ret void + +uniform.ret1: + store volatile i32 42, i32 addrspace(3)* undef + ret void + +divergent.ret: + store volatile i32 9, i32 addrspace(1)* undef + ret void +} + +; IR-LABEL: @multi_divergent_unreachable_exit( +; IR: UnifiedUnreachableBlock: +; IR-NEXT: call void @llvm.amdgcn.unreachable() +; IR-NEXT: br label %UnifiedReturnBlock + +; IR: UnifiedReturnBlock: +; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 +; IR-NEXT: ret void +define amdgpu_kernel void @multi_divergent_unreachable_exit() #0 { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + switch i32 %tmp, label %bb3 [ + i32 2, label %bb1 + i32 0, label %bb2 + ] + +bb1: ; preds = %bb + unreachable + +bb2: ; preds = %bb + unreachable + +bb3: ; preds = %bb + switch i32 undef, label %bb5 [ + i32 2, label %bb4 + ] + +bb4: ; preds = %bb3 + ret void + +bb5: ; preds = %bb3 + unreachable +} 
+ +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } Index: test/CodeGen/AMDGPU/ret_jump.ll =================================================================== --- test/CodeGen/AMDGPU/ret_jump.ll +++ test/CodeGen/AMDGPU/ret_jump.ll @@ -4,20 +4,78 @@ ; This should end with an no-op sequence of exec mask manipulations ; Mask should be in original state after executed unreachable block -; GCN-LABEL: {{^}}main: + +; GCN-LABEL: {{^}}uniform_br_trivial_ret_divergent_br_trivial_unreachable: ; GCN: s_cbranch_scc1 [[RET_BB:BB[0-9]+_[0-9]+]] +; GCN-NEXT: ; %else + ; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc ; GCN-NEXT: s_xor_b64 [[XOR_EXEC:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE_EXEC]] -; GCN-NEXT: ; mask branch [[UNREACHABLE_BB:BB[0-9]+_[0-9]+]] +; GCN-NEXT: ; mask branch [[FLOW:BB[0-9]+_[0-9]+]] -; GCN: [[RET_BB]]: -; GCN-NEXT: s_branch [[FINAL_BB:BB[0-9]+_[0-9]+]] +; GCN: BB{{[0-9]+_[0-9]+}}: ; %unreachable.bb +; GCN-NEXT: ; divergent unreachable -; GCN-NEXT: [[UNREACHABLE_BB]]: -; GCN-NEXT: [[FINAL_BB]]: +; GCN-NEXT: {{^}}[[FLOW]]: ; %Flow +; GCN-NEXT: s_or_b64 exec, exec + +; GCN-NEXT: [[RET_BB]]: +; GCN-NEXT: ; return ; GCN-NEXT: .Lfunc_end0 -define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <8 x i32>] addrspace(2)* byval %arg2, i32 addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, i32 %arg18, i32 %arg19, float %arg20, i32 %arg21) #0 { +define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float }> @uniform_br_trivial_ret_divergent_br_trivial_unreachable([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <8 x i32>] addrspace(2)* byval %arg2, i32 addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, i32 inreg %arg17, i32 %arg18, i32 %arg19, float %arg20, i32 %arg21) #0 { +entry: + %i.i = extractelement <2 x i32> %arg7, i32 0 + %j.i = extractelement <2 x i32> %arg7, i32 1 + %i.f.i = bitcast i32 %i.i to float + %j.f.i = bitcast i32 %j.i to float + %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 1, i32 0, i32 %arg5) #2 + %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 1, i32 0, i32 %arg5) #2 + %p87 = fmul float undef, %p2.i + %p88 = fadd float %p87, undef + %p93 = fadd float %p88, undef + %p97 = fmul float %p93, undef + %p102 = fsub float %p97, undef + %p104 = fmul float %p102, undef + %p106 = fadd float 0.000000e+00, %p104 + %p108 = fadd float undef, %p106 + %uniform.cond = icmp slt i32 %arg17, 0 + br i1 %uniform.cond, label %ret.bb, label %else + +else: ; preds = %entry + %p124 = fmul float %p108, %p108 + %p125 = fsub float %p124, undef + %divergent.cond = fcmp olt float %p125, 0.000000e+00 + br i1 %divergent.cond, label %ret.bb, label %unreachable.bb + +unreachable.bb: ; preds = %else + unreachable + +ret.bb: ; preds = %else, %entry + ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef +} + +; GCN-LABEL: {{^}}uniform_br_nontrivial_ret_divergent_br_nontrivial_unreachable: +; GCN: s_cbranch_vccnz [[RET_BB:BB[0-9]+_[0-9]+]] + +; GCN: ; BB#{{[0-9]+}}: ; %else +; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc +; GCN-NEXT: s_xor_b64 
[[XOR_EXEC:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE_EXEC]] +; GCN-NEXT: ; mask branch [[FLOW1:BB[0-9]+_[0-9]+]] + +; GCN-NEXT: ; %unreachable.bb +; GCN: ds_write_b32 +; GCN: s_waitcnt +; GCN: ; divergent unreachable + +; GCN: ; %ret.bb +; GCN: store_dword + +; GCN: ; %UnifiedReturnBlock +; GCN-NEXT: s_or_b64 exec, exec +; GCN-NEXT: ; return +; GCN-NEXT: .Lfunc_end +define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @uniform_br_nontrivial_ret_divergent_br_nontrivial_unreachable([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <8 x i32>] addrspace(2)* byval %arg2, i32 addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, i32 inreg %arg18, i32 %arg19, float %arg20, i32 %arg21) #0 { main_body: %i.i = extractelement <2 x i32> %arg7, i32 0 %j.i = extractelement <2 x i32> %arg7, i32 1 @@ -33,18 +91,21 @@ %p104 = fmul float %p102, undef %p106 = fadd float 0.000000e+00, %p104 %p108 = fadd float undef, %p106 - br i1 undef, label %ENDIF69, label %ELSE + %uniform.cond = icmp slt i32 %arg18, 0 + br i1 %uniform.cond, label %ret.bb, label %else -ELSE: ; preds = %main_body +else: ; preds = %main_body %p124 = fmul float %p108, %p108 %p125 = fsub float %p124, undef - %p126 = fcmp olt float %p125, 0.000000e+00 - br i1 %p126, label %ENDIF69, label %ELSE41 + %divergent.cond = fcmp olt float %p125, 0.000000e+00 + br i1 %divergent.cond, label %ret.bb, label %unreachable.bb -ELSE41: ; preds = %ELSE +unreachable.bb: ; preds = %else + store volatile i32 8, i32 addrspace(3)* undef unreachable -ENDIF69: ; preds = %ELSE, %main_body +ret.bb: ; preds = %else, %main_body + store volatile i32 11, i32 addrspace(1)* undef ret <{ i32, i32, i32, i32, i32, 
i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef } Index: test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll =================================================================== --- test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll +++ test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll @@ -6,7 +6,7 @@ ; OPT-NOT: call i1 @llvm.amdgcn.loop ; GCN-LABEL: {{^}}annotate_unreachable_noloop: -; GCN: s_cbranch_vccnz +; GCN: s_cbranch_scc1 ; GCN-NOT: s_endpgm ; GCN: .Lfunc_end0 define void @annotate_unreachable_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 { @@ -37,9 +37,14 @@ ; OPT-NOT: call i1 @llvm.amdgcn.loop ; GCN-LABEL: {{^}}annotate_ret_noloop: -; GCN: s_cbranch_scc1 -; GCN: s_endpgm -; GCN: .Lfunc_end1 +; GCN: load_dwordx4 +; GCN: v_cmp_nlt_f32 +; GCN: s_and_saveexec_b64 +; GCN: ; mask branch [[UNIFIED_RET:BB[0-9]+_[0-9]+]] +; GCN-NEXT: [[UNIFIED_RET]]: +; GCN-NEXT: s_or_b64 exec, exec +; GCN-NEXT: s_endpgm +; GCN: .Lfunc_end define void @annotate_ret_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -49,6 +54,38 @@ %tmp2 = sext i32 %tmp to i64 %tmp3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %arg, i64 %tmp2 %tmp4 = load <4 x float>, <4 x float> addrspace(1)* %tmp3, align 16 + %tmp5 = extractelement <4 x float> %tmp4, i32 1 + store volatile <4 x float> %tmp4, <4 x float> addrspace(1)* undef + %cmp = fcmp ogt float %tmp5, 1.0 + br i1 %cmp, label %bb5, label %bb3 + +bb3: ; preds = %bb1 + %tmp6 = extractelement <4 x float> %tmp4, i32 2 + %tmp7 = fcmp olt float %tmp6, 0.000000e+00 + br i1 %tmp7, label %bb4, label %bb5 ; crash goes away if these are swapped + +bb4: ; preds = %bb3 + ret void + +bb5: ; preds = %bb3, %bb1 + ret void +} + +; OPT-LABEL: @uniform_annotate_ret_noloop( +; OPT-NOT: call i1 @llvm.amdgcn.loop + +; GCN-LABEL: {{^}}uniform_annotate_ret_noloop: +; GCN: s_cbranch_scc1 +; GCN: 
s_endpgm +; GCN: .Lfunc_end +define void @uniform_annotate_ret_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg, i32 %tmp) #0 { +bb: + br label %bb1 + +bb1: ; preds = %bb + %tmp2 = sext i32 %tmp to i64 + %tmp3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %arg, i64 %tmp2 + %tmp4 = load <4 x float>, <4 x float> addrspace(1)* %tmp3, align 16 br i1 undef, label %bb5, label %bb3 bb3: ; preds = %bb1 Index: test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll =================================================================== --- test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll +++ test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll @@ -4,16 +4,17 @@ ; GCN: v_cmp_eq_u32 ; GCN: s_and_saveexec_b64 ; GCN: s_xor_b64 -; GCN: ; mask branch [[RET:BB[0-9]+]] -; GCN: s_branch [[UNREACHABLE:BB[0-9]+_[0-9]+]] +; GCN: ; mask branch [[RET:BB[0-9]+_[0-9]+]] -; GCN: [[RET]] -; GCN: s_or_b64 exec, exec -; GCN: s_endpgm - -; GCN: [[UNREACHABLE]]: +; GCN-NEXT: BB{{[0-9]+_[0-9]+}}: ; %unreachable ; GCN: ds_write_b32 +; GCN: ; divergent unreachable ; GCN: s_waitcnt + +; GCN-NEXT: [[RET]]: ; %UnifiedReturnBlock +; GCN-NEXT: s_or_b64 exec, exec +; GCN: s_endpgm + define void @lower_control_flow_unreachable_terminator() #0 { bb: %tmp15 = tail call i32 @llvm.amdgcn.workitem.id.y() @@ -29,18 +30,19 @@ } ; GCN-LABEL: {{^}}lower_control_flow_unreachable_terminator_swap_block_order: -; GCN: v_cmp_eq_u32 +; GCN: v_cmp_ne_u32 ; GCN: s_and_saveexec_b64 ; GCN: s_xor_b64 -; GCN: ; mask branch [[UNREACHABLE:BB[0-9]+_[0-9]+]] +; GCN: ; mask branch [[RETURN:BB[0-9]+_[0-9]+]] -; GCN-NEXT: ; %ret -; GCN-NEXT: s_endpgm - -; GCN-NEXT: [[UNREACHABLE]]: -; GCN-NEXT: s_or_b64 exec, exec +; GCN-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %unreachable ; GCN: ds_write_b32 +; GCN: ; divergent unreachable ; GCN: s_waitcnt + +; GCN: [[RETURN]]: +; GCN-NEXT: s_or_b64 exec, exec +; GCN-NEXT: s_endpgm define void 
@lower_control_flow_unreachable_terminator_swap_block_order() #0 { bb: %tmp15 = tail call i32 @llvm.amdgcn.workitem.id.y() @@ -55,7 +57,29 @@ unreachable } -; Function Attrs: nounwind readnone +; GCN-LABEL: {{^}}uniform_lower_control_flow_unreachable_terminator: +; GCN: s_cmp_lg_u32 +; GCN: s_cbranch_scc0 [[UNREACHABLE:BB[0-9]+_[0-9]+]] + +; GCN-NEXT: BB#{{[0-9]+}}: ; %ret +; GCN-NEXT: s_endpgm + +; GCN: [[UNREACHABLE]]: +; GCN: ds_write_b32 +; GCN: s_waitcnt +define void @uniform_lower_control_flow_unreachable_terminator(i32 %arg0) #0 { +bb: + %tmp63 = icmp eq i32 %arg0, 32 + br i1 %tmp63, label %unreachable, label %ret + +unreachable: + store volatile i32 0, i32 addrspace(3)* undef, align 4 + unreachable + +ret: + ret void +} + declare i32 @llvm.amdgcn.workitem.id.y() #1 attributes #0 = { nounwind } Index: test/CodeGen/AMDGPU/valu-i1.ll =================================================================== --- test/CodeGen/AMDGPU/valu-i1.ll +++ test/CodeGen/AMDGPU/valu-i1.ll @@ -64,29 +64,100 @@ ret void } -; SI-LABEL: @simple_test_v_if +; SI-LABEL: {{^}}simple_test_v_if: ; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}} ; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc ; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]] +; SI: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]] -; SI: BB{{[0-9]+_[0-9]+}}: +; SI-NEXT: BB{{[0-9]+_[0-9]+}}: ; SI: buffer_store_dword -; SI: s_endpgm +; SI-NEXT: s_waitcnt -; SI: BB1_2: +; SI-NEXT: {{^}}[[EXIT]]: ; SI: s_or_b64 exec, exec, [[BR_SREG]] ; SI: s_endpgm define void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %is.0 = icmp ne i32 %tid, 0 - br i1 %is.0, label %store, label %exit + br i1 %is.0, label %then, label %exit + +then: + %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid + store i32 999, i32 addrspace(1)* %gep + br label %exit + +exit: + ret void +} + +; FIXME: It would be better to endpgm in the then block. 
+ +; SI-LABEL: {{^}}simple_test_v_if_ret_else_ret: +; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}} +; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc +; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]] +; SI: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]] + +; SI-NEXT: BB{{[0-9]+_[0-9]+}}: +; SI: buffer_store_dword +; SI-NEXT: s_waitcnt + +; SI-NEXT: {{^}}[[EXIT]]: +; SI: s_or_b64 exec, exec, [[BR_SREG]] +; SI: s_endpgm +define void @simple_test_v_if_ret_else_ret(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %is.0 = icmp ne i32 %tid, 0 + br i1 %is.0, label %then, label %exit + +then: + %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid + store i32 999, i32 addrspace(1)* %gep + ret void + +exit: + ret void +} + +; Final block has more than a ret to execute. This was miscompiled +; before function exit blocks were unified since the endpgm would +; terminate the then wavefront before reaching the store. + +; SI-LABEL: {{^}}simple_test_v_if_ret_else_code_ret: +; SI: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}} +; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc +; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]] +; SI: ; mask branch [[FLOW:BB[0-9]+_[0-9]+]] + +; SI-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %exit +; SI: ds_write_b32 +; SI: s_waitcnt + +; SI-NEXT: {{^}}[[FLOW]]: +; SI-NEXT: s_or_saveexec_b64 +; SI-NEXT: s_xor_b64 exec, exec +; SI-NEXT: ; mask branch [[UNIFIED_RETURN:BB[0-9]+_[0-9]+]] + +; SI-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %then +; SI: buffer_store_dword +; SI-NEXT: s_waitcnt + +; SI-NEXT: {{^}}[[UNIFIED_RETURN]]: ; %UnifiedReturnBlock +; SI: s_or_b64 exec, exec +; SI: s_endpgm +define void @simple_test_v_if_ret_else_code_ret(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %is.0 = icmp ne i32 %tid, 0 + br i1 %is.0, label %then, label %exit -store: +then: %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid store i32 999, i32 addrspace(1)* %gep ret void 
exit: + store volatile i32 7, i32 addrspace(3)* undef ret void }