diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -345,6 +345,9 @@ void initializeGCNNSAReassignPass(PassRegistry &); extern char &GCNNSAReassignID; +void initializeGCNPreRABranchDistancePass(PassRegistry &); +extern char &GCNPreRABranchDistanceID; + void initializeGCNPreRAOptimizationsPass(PassRegistry &); extern char &GCNPreRAOptimizationsID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -412,6 +412,7 @@ initializeAMDGPUResourceUsageAnalysisPass(*PR); initializeGCNNSAReassignPass(*PR); initializeGCNPreRAOptimizationsPass(*PR); + initializeGCNPreRABranchDistancePass(*PR); } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { @@ -1326,6 +1327,8 @@ if (!usingDefaultRegAlloc()) report_fatal_error(RegAllocOptNotSupportedMessage); + addPass(&GCNPreRABranchDistanceID); + addPass(createSGPRAllocPass(false)); // Equivalent of PEI for SGPRs. @@ -1339,6 +1342,8 @@ if (!usingDefaultRegAlloc()) report_fatal_error(RegAllocOptNotSupportedMessage); + addPass(&GCNPreRABranchDistanceID); + addPass(createSGPRAllocPass(true)); // Commit allocated register changes. 
This is mostly necessary because too diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -109,6 +109,7 @@ GCNIterativeScheduler.cpp GCNMinRegStrategy.cpp GCNNSAReassign.cpp + GCNPreRABranchDistance.cpp GCNPreRAOptimizations.cpp GCNRegPressure.cpp GCNSchedStrategy.cpp diff --git a/llvm/lib/Target/AMDGPU/GCNPreRABranchDistance.cpp b/llvm/lib/Target/AMDGPU/GCNPreRABranchDistance.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/GCNPreRABranchDistance.cpp @@ -0,0 +1,187 @@ +//===-- GCNPreRABranchDistance.cpp +//-----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "GCNSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/InitializePasses.h" +#include <cmath> +#include <cstdint> + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-pre-ra-branch-distance" + +namespace { + +static cl::opt<double> LongBranchFactor( "amdgpu-long-branch-factor", cl::init(1.0), cl::Hidden, cl::desc("Factor to apply to what qualifies as a long branch " "to reserve a pair of scalar registers.")); + +class GCNPreRABranchDistance : public MachineFunctionPass { + /// BasicBlockInfo - Information about the offset and size of a single + /// basic block. + struct BasicBlockInfo { + /// Offset - Distance from the beginning of the function to the beginning + /// of this basic block. 
+ /// + /// The offset is always aligned as required by the basic block. + unsigned Offset = 0; + + /// Size - Size of the basic block in bytes. If the block contains + /// inline assembly, this is a worst case estimate. + /// + /// The size does not include any alignment padding whether from the + /// beginning of the block, or from an aligned jump table at the end. + unsigned Size = 0; + + BasicBlockInfo() = default; + + /// Compute the offset immediately following this block. \p MBB is the next + /// block. + unsigned postOffset(const MachineBasicBlock &MBB) const { + const unsigned PO = Offset + Size; + const Align Alignment = MBB.getAlignment(); + const Align ParentAlign = MBB.getParent()->getAlignment(); + if (Alignment <= ParentAlign) + return alignTo(PO, Alignment); + + // The alignment of this MBB is larger than the function's alignment, so + // we can't tell whether or not it will insert nops. Assume that it will. + return alignTo(PO, Alignment) + Alignment.value() - ParentAlign.value(); + } + }; + SmallVector<BasicBlockInfo> BlockInfo; + MachineFunction *MF; + const SIInstrInfo *TII; + void scanFunction(); + void adjustBlockOffsets(MachineBasicBlock &Start); + uint64_t computeBlockSize(const MachineBasicBlock &MBB) const; + unsigned getInstrOffset(const MachineInstr &MI) const; + +public: + static char ID; + GCNPreRABranchDistance() : MachineFunctionPass(ID) { + initializeGCNPreRABranchDistancePass(*PassRegistry::getPassRegistry()); + } + void scanFunction(MachineFunction &MF); + bool runOnMachineFunction(MachineFunction &MF) override; + StringRef getPassName() const override { + return "AMDGPU Pre-RA Branch Distance"; + } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; +} // End anonymous namespace. 
+char GCNPreRABranchDistance::ID = 0; + +INITIALIZE_PASS_BEGIN(GCNPreRABranchDistance, DEBUG_TYPE, + "AMDGPU Pre-RA Branch Distance", false, false) +INITIALIZE_PASS_END(GCNPreRABranchDistance, DEBUG_TYPE, + "AMDGPU Pre-RA Branch Distance", false, false) + +char &llvm::GCNPreRABranchDistanceID = GCNPreRABranchDistance::ID; + +/// scanFunction - Do the initial scan of the function, building up +/// information about each block. +void GCNPreRABranchDistance::scanFunction() { + BlockInfo.clear(); + BlockInfo.resize(MF->getNumBlockIDs()); + + // First thing, compute the size of all basic blocks, and see if the function + // has any inline assembly in it. If so, we have to be conservative about + // alignment assumptions, as we don't know for sure the size of any + // instructions in the inline assembly. + for (MachineBasicBlock &MBB : *MF) + BlockInfo[MBB.getNumber()].Size = computeBlockSize(MBB); + + // Compute block offsets and known bits. + adjustBlockOffsets(*MF->begin()); +} + +uint64_t +GCNPreRABranchDistance::computeBlockSize(const MachineBasicBlock &MBB) const { + uint64_t CodeSize = 0; + for (const MachineInstr &MI : MBB) + CodeSize += TII->getInstSizeInBytes(MI); + return CodeSize; +} + +void GCNPreRABranchDistance::adjustBlockOffsets(MachineBasicBlock &Start) { + unsigned PrevNum = Start.getNumber(); + for (auto &MBB : + make_range(std::next(MachineFunction::iterator(Start)), MF->end())) { + unsigned Num = MBB.getNumber(); + // Get the offset and known bits at the end of the layout predecessor. + // Include the alignment of the current block. + BlockInfo[Num].Offset = BlockInfo[PrevNum].postOffset(MBB); + + PrevNum = Num; + } +} + +/// getInstrOffset - Return the current offset of the specified machine +/// instruction from the start of the function. This offset changes as stuff is +/// moved around inside the function. 
+unsigned GCNPreRABranchDistance::getInstrOffset(const MachineInstr &MI) const { + const MachineBasicBlock *MBB = MI.getParent(); + + // The offset is composed of two things: the sum of the sizes of all MBB's + // before this instruction's block, and the offset from the start of the block + // it is in. + unsigned Offset = BlockInfo[MBB->getNumber()].Offset; + + // Sum instructions before MI in MBB. + for (MachineBasicBlock::const_iterator I = MBB->begin(); &*I != &MI; ++I) { + assert(I != MBB->end() && "Didn't find MI in its own basic block?"); + Offset += TII->getInstSizeInBytes(*I); + } + + return Offset; +} + +bool GCNPreRABranchDistance::runOnMachineFunction(MachineFunction &Fn) { + MF = &Fn; + const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>(); + TII = STM.getInstrInfo(); + SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); + + // Do the initial scan of the function, building up information about the + // sizes of each block. + scanFunction(); + + for (MachineBasicBlock &MBB : *MF) { + MachineBasicBlock::iterator Last = MBB.getLastNonDebugInstr(); + if (Last == MBB.end()) + continue; + if (Last->isConditionalBranch()) + continue; + if (Last->isUnconditionalBranch()) { + MachineBasicBlock *DestBB = TII->getBranchDestBlock(*Last); + int64_t DestOffset = BlockInfo[DestBB->getNumber()].Offset; + int64_t SrcOffset = getInstrOffset(*Last); + int64_t Offset = + static_cast<int64_t>(LongBranchFactor * (DestOffset - SrcOffset)); + if (!TII->isBranchOffsetInRange(Last->getOpcode(), Offset)) { + MFI->setLongBranchReservedReg(); + return false; + } + } + } + return false; +} diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2534,6 +2534,7 @@ MachineFunction *MF = MBB.getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); + SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); // FIXME: Virtual register workaround for RegScavenger not working with empty // blocks. 
@@ -2601,6 +2602,12 @@ Register Scav = RS->scavengeRegisterBackwards( AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC), /* RestoreAfter */ false, 0, /* AllowSpill */ false); + + Register LongBranchReservedReg = MFI->getLongBranchReservedReg(); + if (!Scav && LongBranchReservedReg != AMDGPU::NoRegister) { + Scav = LongBranchReservedReg; + } + if (Scav) { RS->setRegUsed(Scav); MRI.replaceRegWith(PCReg, Scav); diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -381,6 +381,8 @@ // base to the beginning of the new function's frame. Register StackPtrOffsetReg = AMDGPU::SP_REG; + Register LongBranchReservedReg = AMDGPU::NoRegister; + AMDGPUFunctionArgInfo ArgInfo; // Graphics info. @@ -888,6 +890,10 @@ StackPtrOffsetReg = Reg; } + void setLongBranchReservedReg(Register Reg = AMDGPU::SGPR6_SGPR7) { + LongBranchReservedReg = Reg; + } + // Note the unset value for this is AMDGPU::SP_REG rather than // NoRegister. 
This is mostly a workaround for MIR tests where state that // can't be directly computed from the function is not preserved in serialized @@ -896,6 +902,8 @@ return StackPtrOffsetReg; } + Register getLongBranchReservedReg() const { return LongBranchReservedReg; } + Register getQueuePtrUserSGPR() const { return ArgInfo.QueuePtr.getRegister(); } diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -120,6 +120,7 @@ ; GCN-O0-NEXT: Virtual Register Map ; GCN-O0-NEXT: Live Register Matrix ; GCN-O0-NEXT: SI Pre-allocate WWM Registers +; GCN-O0-NEXT: AMDGPU Pre-RA Branch Distance ; GCN-O0-NEXT: Fast Register Allocator ; GCN-O0-NEXT: SI lower SGPR spill instructions ; GCN-O0-NEXT: Fast Register Allocator @@ -355,6 +356,7 @@ ; GCN-O1-NEXT: Live Register Matrix ; GCN-O1-NEXT: SI Pre-allocate WWM Registers ; GCN-O1-NEXT: SI optimize exec mask operations pre-RA +; GCN-O1-NEXT: AMDGPU Pre-RA Branch Distance ; GCN-O1-NEXT: Machine Natural Loop Construction ; GCN-O1-NEXT: Machine Block Frequency Analysis ; GCN-O1-NEXT: Debug Variable Analysis @@ -663,6 +665,7 @@ ; GCN-O1-OPTS-NEXT: Live Register Matrix ; GCN-O1-OPTS-NEXT: SI Pre-allocate WWM Registers ; GCN-O1-OPTS-NEXT: SI optimize exec mask operations pre-RA +; GCN-O1-OPTS-NEXT: AMDGPU Pre-RA Branch Distance ; GCN-O1-OPTS-NEXT: Machine Natural Loop Construction ; GCN-O1-OPTS-NEXT: Machine Block Frequency Analysis ; GCN-O1-OPTS-NEXT: Debug Variable Analysis @@ -973,6 +976,7 @@ ; GCN-O2-NEXT: SI Pre-allocate WWM Registers ; GCN-O2-NEXT: SI optimize exec mask operations pre-RA ; GCN-O2-NEXT: SI Form memory clauses +; GCN-O2-NEXT: AMDGPU Pre-RA Branch Distance ; GCN-O2-NEXT: Machine Natural Loop Construction ; GCN-O2-NEXT: Machine Block Frequency Analysis ; GCN-O2-NEXT: Debug Variable Analysis @@ -1294,6 +1298,7 @@ ; GCN-O3-NEXT: SI Pre-allocate WWM Registers ; GCN-O3-NEXT: SI optimize exec 
mask operations pre-RA ; GCN-O3-NEXT: SI Form memory clauses +; GCN-O3-NEXT: AMDGPU Pre-RA Branch Distance ; GCN-O3-NEXT: Machine Natural Loop Construction ; GCN-O3-NEXT: Machine Block Frequency Analysis ; GCN-O3-NEXT: Debug Variable Analysis diff --git a/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll b/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll @@ -0,0 +1,328 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-s-branch-bits=4 -amdgpu-long-branch-factor=1.0 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s + +; OBJ: Relocations [ +; OBJ-NEXT: ] + +; Used to emit an always 4 byte instruction. Inline asm always assumes +; each instruction is the maximum size. +declare void @llvm.amdgcn.s.sleep(i32) #0 + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + + +define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(ptr addrspace(1) %arg, i32 %cnd) #0 { +; GCN-LABEL: uniform_conditional_max_short_forward_branch: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_eq_u32 s2, 0 +; GCN-NEXT: s_cbranch_scc1 .LBB0_2 +; GCN-NEXT: ; %bb.1: ; %bb2 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_sleep 0 +; GCN-NEXT: .LBB0_2: ; %bb3 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm +bb: + %cmp = icmp eq i32 %cnd, 0 + br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch + +bb2: +; 24 bytes + call void asm sideeffect + "v_nop_e64 + v_nop_e64 + v_nop_e64", ""() #0 + call void @llvm.amdgcn.s.sleep(i32 0) + 
br label %bb3 + +bb3: + store volatile i32 %cnd, ptr addrspace(1) %arg + ret void +} + +define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(ptr addrspace(1) %arg, i32 %cnd) #0 { +; GCN-LABEL: uniform_conditional_min_long_forward_branch: +; GCN: ; %bb.0: ; %bb0 +; GCN-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_eq_u32 s2, 0 +; GCN-NEXT: s_cbranch_scc0 .LBB1_1 +; GCN-NEXT: .LBB1_3: ; %bb0 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: .Lpost_getpc0: +; GCN-NEXT: s_add_u32 s4, s4, (.LBB1_2-.Lpost_getpc0)&4294967295 +; GCN-NEXT: s_addc_u32 s5, s5, (.LBB1_2-.Lpost_getpc0)>>32 +; GCN-NEXT: s_setpc_b64 s[4:5] +; GCN-NEXT: .LBB1_1: ; %bb2 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: .LBB1_2: ; %bb3 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm +bb0: + %cmp = icmp eq i32 %cnd, 0 + br i1 %cmp, label %bb3, label %bb2 ; +9 dword branch + +bb2: +; 32 bytes + call void asm sideeffect + "v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64", ""() #0 + br label %bb3 + +bb3: + store volatile i32 %cnd, ptr addrspace(1) %arg + ret void +} + +define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(ptr addrspace(1) %arg, float %cnd) #0 { +; GCN-LABEL: uniform_conditional_min_long_forward_vcnd_branch: +; GCN: ; %bb.0: ; %bb0 +; GCN-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_f32_e64 s[4:5], s2, 0 +; GCN-NEXT: s_and_b64 vcc, exec, s[4:5] +; GCN-NEXT: s_cbranch_vccz .LBB2_1 +; GCN-NEXT: .LBB2_3: ; %bb0 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: .Lpost_getpc1: +; GCN-NEXT: s_add_u32 s4, s4, (.LBB2_2-.Lpost_getpc1)&4294967295 +; 
GCN-NEXT: s_addc_u32 s5, s5, (.LBB2_2-.Lpost_getpc1)>>32 +; GCN-NEXT: s_setpc_b64 s[4:5] +; GCN-NEXT: .LBB2_1: ; %bb2 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; 32 bytes +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: .LBB2_2: ; %bb3 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm +bb0: + %cmp = fcmp oeq float %cnd, 0.0 + br i1 %cmp, label %bb3, label %bb2 ; + 8 dword branch + +bb2: + call void asm sideeffect " ; 32 bytes + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64", ""() #0 + br label %bb3 + +bb3: + store volatile float %cnd, ptr addrspace(1) %arg + ret void +} + +define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 { +; GCN-LABEL: min_long_forward_vbranch: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN-NEXT: s_cbranch_execnz .LBB3_1 +; GCN-NEXT: .LBB3_3: ; %bb +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: .Lpost_getpc2: +; GCN-NEXT: s_add_u32 s4, s4, (.LBB3_2-.Lpost_getpc2)&4294967295 +; GCN-NEXT: s_addc_u32 s5, s5, (.LBB3_2-.Lpost_getpc2)>>32 +; GCN-NEXT: s_setpc_b64 s[4:5] +; GCN-NEXT: .LBB3_1: ; %bb2 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; 32 bytes +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; 
GCN-NEXT: v_nop_e64 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: .LBB3_2: ; %bb3 +; GCN-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN-NEXT: s_mov_b32 s0, s2 +; GCN-NEXT: s_mov_b32 s1, s2 +; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm +bb: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = zext i32 %tid to i64 + %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tid.ext + %load = load volatile i32, ptr addrspace(1) %gep + %cmp = icmp eq i32 %load, 0 + br i1 %cmp, label %bb3, label %bb2 ; + 8 dword branch + +bb2: + call void asm sideeffect " ; 32 bytes + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64", ""() #0 + br label %bb3 + +bb3: + store volatile i32 %load, ptr addrspace(1) %gep + ret void +} + +define amdgpu_kernel void @long_backward_sbranch(ptr addrspace(1) %arg) #0 { +; GCN-LABEL: long_backward_sbranch: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: .LBB4_1: ; %bb2 +; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: s_add_i32 s0, s0, 1 +; GCN-NEXT: s_cmp_lt_i32 s0, 10 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_cbranch_scc0 .LBB4_2 +; GCN-NEXT: .LBB4_3: ; %bb2 +; GCN-NEXT: ; in Loop: Header=BB4_1 Depth=1 +; GCN-NEXT: s_getpc_b64 s[2:3] +; GCN-NEXT: .Lpost_getpc3: +; GCN-NEXT: s_add_u32 s2, s2, (.LBB4_1-.Lpost_getpc3)&4294967295 +; GCN-NEXT: s_addc_u32 s3, s3, (.LBB4_1-.Lpost_getpc3)>>32 +; GCN-NEXT: s_setpc_b64 s[2:3] +; GCN-NEXT: .LBB4_2: ; %bb3 +; GCN-NEXT: s_endpgm +bb: + br label %bb2 + +bb2: + %loop.idx = phi i32 [ 0, %bb ], [ %inc, %bb2 ] + ; 24 bytes + call void asm sideeffect + "v_nop_e64 + v_nop_e64 + v_nop_e64", ""() #0 + %inc = add nsw i32 %loop.idx, 1 ; add cost 4 + %cmp = icmp slt i32 %inc, 10 ; condition cost = 8 + br i1 %cmp, label %bb2, label %bb3 ; - + +bb3: + ret void +} + +; Requires expansion of unconditional branch from %bb2 to %bb4 (and +; expansion 
of conditional branch from %bb to %bb3. + +define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) { +; GCN-LABEL: uniform_unconditional_min_long_forward_branch: +; GCN: ; %bb.0: ; %bb0 +; GCN-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_eq_u32 s2, 0 +; GCN-NEXT: s_mov_b64 s[2:3], -1 +; GCN-NEXT: s_cbranch_scc0 .LBB5_1 +; GCN-NEXT: .LBB5_7: ; %bb0 +; GCN-NEXT: s_getpc_b64 s[2:3] +; GCN-NEXT: .Lpost_getpc5: +; GCN-NEXT: s_add_u32 s2, s2, (.LBB5_4-.Lpost_getpc5)&4294967295 +; GCN-NEXT: s_addc_u32 s3, s3, (.LBB5_4-.Lpost_getpc5)>>32 +; GCN-NEXT: s_setpc_b64 s[2:3] +; GCN-NEXT: .LBB5_1: ; %Flow +; GCN-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN-NEXT: s_cbranch_vccnz .LBB5_3 +; GCN-NEXT: .LBB5_2: ; %bb2 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, 17 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: .LBB5_3: ; %bb4 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 63 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm +; GCN-NEXT: .LBB5_4: ; %bb3 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_cbranch_execnz .LBB5_5 +; GCN-NEXT: .LBB5_9: ; %bb3 +; GCN-NEXT: s_getpc_b64 s[2:3] +; GCN-NEXT: .Lpost_getpc6: +; GCN-NEXT: s_add_u32 s2, s2, (.LBB5_2-.Lpost_getpc6)&4294967295 +; GCN-NEXT: s_addc_u32 s3, s3, (.LBB5_2-.Lpost_getpc6)>>32 +; GCN-NEXT: s_setpc_b64 s[2:3] +; GCN-NEXT: .LBB5_5: ; %bb3 +; GCN-NEXT: s_getpc_b64 s[2:3] +; GCN-NEXT: .Lpost_getpc4: +; GCN-NEXT: s_add_u32 s2, s2, (.LBB5_3-.Lpost_getpc4)&4294967295 +; GCN-NEXT: s_addc_u32 s3, s3, (.LBB5_3-.Lpost_getpc4)>>32 
+; GCN-NEXT: s_setpc_b64 s[2:3] +bb0: + %tmp = icmp ne i32 %arg1, 0 + br i1 %tmp, label %bb2, label %bb3 + +bb2: + store volatile i32 17, ptr addrspace(1) undef + br label %bb4 + +bb3: + ; 32 byte asm + call void asm sideeffect + "v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64", ""() #0 + br label %bb4 + +bb4: + store volatile i32 63, ptr addrspace(1) %arg + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone }