diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -327,6 +327,9 @@ void initializeGCNNSAReassignPass(PassRegistry &); extern char &GCNNSAReassignID; +void initializeGCNPreRALongBranchRegPass(PassRegistry &); +extern char &GCNPreRALongBranchRegID; + void initializeGCNPreRAOptimizationsPass(PassRegistry &); extern char &GCNPreRAOptimizationsID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -421,6 +421,7 @@ initializeAMDGPUResourceUsageAnalysisPass(*PR); initializeGCNNSAReassignPass(*PR); initializeGCNPreRAOptimizationsPass(*PR); + initializeGCNPreRALongBranchRegPass(*PR); initializeGCNRewritePartialRegUsesPass(*PR); } @@ -1339,6 +1340,8 @@ if (!usingDefaultRegAlloc()) report_fatal_error(RegAllocOptNotSupportedMessage); + addPass(&GCNPreRALongBranchRegID); + addPass(createSGPRAllocPass(false)); // Equivalent of PEI for SGPRs. @@ -1352,6 +1355,8 @@ if (!usingDefaultRegAlloc()) report_fatal_error(RegAllocOptNotSupportedMessage); + addPass(&GCNPreRALongBranchRegID); + addPass(createSGPRAllocPass(true)); // Commit allocated register changes. This is mostly necessary because too @@ -1474,6 +1479,10 @@ if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy)) return true; + if (parseOptionalRegister(YamlMFI.LongBranchReservedReg, + MFI->LongBranchReservedReg)) + return true; + auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) { // Create a diagnostic for a the register string literal. const MemoryBuffer &Buffer = diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -107,6 +107,7 @@ GCNMinRegStrategy.cpp GCNNSAReassign.cpp GCNPreRAOptimizations.cpp + GCNPreRALongBranchReg.cpp GCNRegPressure.cpp GCNRewritePartialRegUses.cpp GCNSchedStrategy.cpp diff --git a/llvm/lib/Target/AMDGPU/GCNPreRALongBranchReg.cpp b/llvm/lib/Target/AMDGPU/GCNPreRALongBranchReg.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/GCNPreRALongBranchReg.cpp @@ -0,0 +1,139 @@ +//===-- GCNPreRALongBranchReg.cpp ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// \file +// \brief Pass to estimate pre-RA branch size and reserve a pair of SGPRs if +// there is a long branch. Branch size at this point is difficult to track since +// we have no idea what spills will be inserted later on. We just assume 8 bytes +// per instruction to compute approximations without computing the actual +// instruction size, to see if we're in the neighborhood of the maximum branch +// distance threshold. Tuning of what is considered "long" is handled through +// the amdgpu-long-branch-factor cl argument, which sets LongBranchFactor.
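+// For example, with the default LongBranchFactor of 1.0, an unconditional +// branch is treated as long when its destination block's estimated offset +// from the start of the function does not fit in the branch instruction's +// signed 16-bit dword offset field (roughly +/-128 KiB, or less when narrowed +// by amdgpu-s-branch-bits), and the SGPR pair is reserved; with a factor of 0 +// the registers are never reserved.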
+//===----------------------------------------------------------------------===// +#include "AMDGPU.h" +#include "GCNSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/InitializePasses.h" + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-pre-ra-long-branch-reg" + +namespace { + +static cl::opt<double> LongBranchFactor( + "amdgpu-long-branch-factor", cl::init(1.0), cl::Hidden, + cl::desc("Factor to apply to what qualifies as a long branch " + "to reserve a pair of scalar registers. If this value " + "is 0 the long branch registers are never reserved. As this " + "value grows, the greater the chance the branch distance will fall " + "within the threshold and the registers will be marked to be " + "reserved. We lean towards always reserving a register for " + "long jumps")); + +class GCNPreRALongBranchReg : public MachineFunctionPass { + + struct BasicBlockInfo { + // Offset - Distance from the beginning of the function to the beginning + // of this basic block. + uint64_t Offset = 0; + // Size - Size of the basic block in bytes. + uint64_t Size = 0; + }; + void generateBlockInfo(MachineFunction &MF, + SmallVectorImpl<BasicBlockInfo> &BlockInfo); + +public: + static char ID; + GCNPreRALongBranchReg() : MachineFunctionPass(ID) { + initializeGCNPreRALongBranchRegPass(*PassRegistry::getPassRegistry()); + } + bool runOnMachineFunction(MachineFunction &MF) override; + StringRef getPassName() const override { + return "AMDGPU Pre-RA Long Branch Reg"; + } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; +} // End anonymous namespace. +char GCNPreRALongBranchReg::ID = 0; + +INITIALIZE_PASS(GCNPreRALongBranchReg, DEBUG_TYPE, + "AMDGPU Pre-RA Long Branch Reg", false, false) + +char &llvm::GCNPreRALongBranchRegID = GCNPreRALongBranchReg::ID; +void GCNPreRALongBranchReg::generateBlockInfo( + MachineFunction &MF, SmallVectorImpl<BasicBlockInfo> &BlockInfo) { + + BlockInfo.resize(MF.getNumBlockIDs()); + + // Approximate the size of all basic blocks by just + // assuming 8 bytes per instruction + for (const MachineBasicBlock &MBB : MF) { + uint64_t NumInstr = 0; + // Loop through the basic block and add up all non-debug + // non-meta instructions + for (const MachineInstr &MI : MBB) { + // isMetaInstruction is a superset of isDebugInstr + if (MI.isMetaInstruction()) + continue; + NumInstr += 1; + } + // Approximate size as just 8 bytes per instruction + BlockInfo[MBB.getNumber()].Size = 8 * NumInstr; + } + uint64_t PrevNum = (&MF)->begin()->getNumber(); + for (auto &MBB : + make_range(std::next(MachineFunction::iterator((&MF)->begin())), + (&MF)->end())) { + uint64_t Num = MBB.getNumber(); + // Compute the offset immediately following this block. + BlockInfo[Num].Offset = BlockInfo[PrevNum].Offset + BlockInfo[PrevNum].Size; + PrevNum = Num; + } +} +bool GCNPreRALongBranchReg::runOnMachineFunction(MachineFunction &MF) { + const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); + const SIInstrInfo *TII = STM.getInstrInfo(); + const SIRegisterInfo *TRI = STM.getRegisterInfo(); + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + // For now, reserve highest available SGPR pair. After RA, + // shift down to a lower unused pair of SGPRs. + // If all registers are used, then findUnusedRegister will return + // AMDGPU::NoRegister.
+ constexpr bool ReserveHighestRegister = true; + Register LongBranchReservedReg = TRI->findUnusedRegister( + MRI, &AMDGPU::SGPR_64RegClass, MF, ReserveHighestRegister); + if (!LongBranchReservedReg) + return false; + + // Approximate code size and offsets of each basic block + SmallVector<BasicBlockInfo> BlockInfo; + generateBlockInfo(MF, BlockInfo); + + for (const MachineBasicBlock &MBB : MF) { + MachineBasicBlock::const_iterator Last = MBB.getLastNonDebugInstr(); + if (Last == MBB.end() || !Last->isUnconditionalBranch()) + continue; + MachineBasicBlock *DestBB = TII->getBranchDestBlock(*Last); + uint64_t BlockDistance = static_cast<uint64_t>( + LongBranchFactor * BlockInfo[DestBB->getNumber()].Offset); + // If the distance falls outside the threshold, assume it is a long branch + // and we need to reserve the registers + if (!TII->isBranchOffsetInRange(Last->getOpcode(), BlockDistance)) { + MFI->setLongBranchReservedReg(LongBranchReservedReg); + return true; + } + } + return false; +} diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -1437,13 +1437,27 @@ TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF); if (UnusedLowVGPR && (TRI->getHWRegIndex(UnusedLowVGPR) < TRI->getHWRegIndex(VGPRForAGPRCopy))) { - // Call to setVGPRForAGPRCopy() should happen first before calling - // freezeReservedRegs() so that getReservedRegs() can reserve this newly - // identified VGPR (for AGPR copy). + // Reserve this newly identified VGPR (for AGPR copy). + // Reserved registers should already be frozen at this point, + // so we can avoid calling MRI.freezeReservedRegs and just use + // MRI.reserveReg. FuncInfo->setVGPRForAGPRCopy(UnusedLowVGPR); - MRI.freezeReservedRegs(MF); + MRI.reserveReg(UnusedLowVGPR, TRI); } } + // We initially reserved the highest available SGPR pair for long branches; + // now, after RA, we shift down to a lower unused one if one exists. + Register LongBranchReservedReg = FuncInfo->getLongBranchReservedReg(); + Register UnusedLowSGPR = + TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_64RegClass, MF); + // If LongBranchReservedReg is null, then we didn't find a long branch + // and never reserved a register to begin with, so there is nothing to + // shift down. Likewise, if UnusedLowSGPR is null, there isn't an available + // lower register to use, so just keep the original one we set. + if (LongBranchReservedReg && UnusedLowSGPR) { + FuncInfo->setLongBranchReservedReg(UnusedLowSGPR); + MRI.reserveReg(UnusedLowSGPR, TRI); + } } // The special SGPR spills like the one needed for FP, BP or any reserved diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2550,6 +2550,7 @@ MachineFunction *MF = MBB.getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); + const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); // FIXME: Virtual register workaround for RegScavenger not working with empty // blocks.
@@ -2613,10 +2614,20 @@ // dest_bb: // buzz; - RS->enterBasicBlockEnd(MBB); - Register Scav = RS->scavengeRegisterBackwards( - AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC), - /* RestoreAfter */ false, 0, /* AllowSpill */ false); + Register LongBranchReservedReg = MFI->getLongBranchReservedReg(); + Register Scav; + + // If we've previously reserved a register for long branches + // avoid running the scavenger and just use those registers + if (LongBranchReservedReg) { + RS->enterBasicBlock(MBB); + Scav = LongBranchReservedReg; + } else { + RS->enterBasicBlockEnd(MBB); + Scav = RS->scavengeRegisterBackwards( + AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC), + /* RestoreAfter */ false, 0, /* AllowSpill */ false); + } if (Scav) { RS->setRegUsed(Scav); MRI.replaceRegWith(PCReg, Scav); diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -283,6 +283,7 @@ SIMode Mode; std::optional ScavengeFI; StringValue VGPRForAGPRCopy; + StringValue LongBranchReservedReg; SIMachineFunctionInfo() = default; SIMachineFunctionInfo(const llvm::SIMachineFunctionInfo &, @@ -326,6 +327,8 @@ YamlIO.mapOptional("scavengeFI", MFI.ScavengeFI); YamlIO.mapOptional("vgprForAGPRCopy", MFI.VGPRForAGPRCopy, StringValue()); // Don't print out when it's empty. + YamlIO.mapOptional("longBranchReservedReg", MFI.LongBranchReservedReg, + StringValue()); } }; @@ -381,6 +384,11 @@ // base to the beginning of the new function's frame. Register StackPtrOffsetReg = AMDGPU::SP_REG; + // Registers that may be reserved when RA doesn't allocate enough + // registers to plan for the case where an indirect branch ends up + // being needed during branch relaxation. + Register LongBranchReservedReg; + AMDGPUFunctionArgInfo ArgInfo; // Graphics info. @@ -891,6 +899,8 @@ StackPtrOffsetReg = Reg; } + void setLongBranchReservedReg(Register Reg) { LongBranchReservedReg = Reg; } + // Note the unset value for this is AMDGPU::SP_REG rather than // NoRegister. 
This is mostly a workaround for MIR tests where state that + // can't be directly computed from the function is not preserved in serialized + // MIR. @@ -899,6 +909,8 @@ return StackPtrOffsetReg; } + Register getLongBranchReservedReg() const { return LongBranchReservedReg; } + Register getQueuePtrUserSGPR() const { return ArgInfo.QueuePtr.getRegister(); } diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -648,6 +648,8 @@ for (Register Reg : MFI.getWWMReservedRegs()) WWMReservedRegs.push_back(regToString(Reg, TRI)); + if (MFI.getLongBranchReservedReg()) + LongBranchReservedReg = regToString(MFI.getLongBranchReservedReg(), TRI); if (MFI.getVGPRForAGPRCopy()) VGPRForAGPRCopy = regToString(MFI.getVGPRForAGPRCopy(), TRI); auto SFI = MFI.getOptionalScavengeFI(); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -615,6 +615,10 @@ reserveRegisterTuples(Reserved, ScratchRSrcReg); } + Register LongBranchReservedReg = MFI->getLongBranchReservedReg(); + if (LongBranchReservedReg) + reserveRegisterTuples(Reserved, LongBranchReservedReg); + // We have to assume the SP is needed in case there are calls in the function, // which is detected after the function is lowered. If we aren't really going // to need SP, don't bother reserving it. @@ -2878,13 +2882,12 @@ /// Returns a lowest register that is not used at any point in the function. /// If all registers are used, then this function will return -/// AMDGPU::NoRegister. If \p ReserveHighestVGPR = true, then return +/// AMDGPU::NoRegister. If \p ReserveHighestRegister = true, then return /// highest unused register.
-MCRegister SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI, - const TargetRegisterClass *RC, - const MachineFunction &MF, - bool ReserveHighestVGPR) const { - if (ReserveHighestVGPR) { +MCRegister SIRegisterInfo::findUnusedRegister( + const MachineRegisterInfo &MRI, const TargetRegisterClass *RC, + const MachineFunction &MF, bool ReserveHighestRegister) const { + if (ReserveHighestRegister) { for (MCRegister Reg : reverse(*RC)) if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg)) return Reg; diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll --- a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -verify-machineinstrs -amdgpu-s-branch-bits=5 -o - %s | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -verify-machineinstrs -amdgpu-s-branch-bits=5 -amdgpu-long-branch-factor=0 -o - %s | FileCheck %s define amdgpu_kernel void @spill(ptr addrspace(1) %arg, i32 %cnd) #0 { ; CHECK-LABEL: spill: diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-s-branch-bits=4 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-s-branch-bits=4 -simplifycfg-require-and-preserve-domtree=1 -amdgpu-long-branch-factor=0 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s ; FIXME: We should use llvm-mc for this, but we can't even parse our own output. ; See PR33579. 
-; RUN: llc -march=amdgcn -verify-machineinstrs -amdgpu-s-branch-bits=4 -o %t.o -filetype=obj -simplifycfg-require-and-preserve-domtree=1 %s +; RUN: llc -march=amdgcn -verify-machineinstrs -amdgpu-s-branch-bits=4 -amdgpu-long-branch-factor=0 -o %t.o -filetype=obj -simplifycfg-require-and-preserve-domtree=1 %s ; RUN: llvm-readobj -r %t.o | FileCheck --check-prefix=OBJ %s ; OBJ: Relocations [ diff --git a/llvm/test/CodeGen/AMDGPU/literal-constant-like-operand-instruction-size.ll b/llvm/test/CodeGen/AMDGPU/literal-constant-like-operand-instruction-size.ll --- a/llvm/test/CodeGen/AMDGPU/literal-constant-like-operand-instruction-size.ll +++ b/llvm/test/CodeGen/AMDGPU/literal-constant-like-operand-instruction-size.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs -amdgpu-s-branch-bits=6 < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs -amdgpu-s-branch-bits=6 -amdgpu-long-branch-factor=0 < %s | FileCheck -check-prefix=GCN %s ; Restrict maximum branch to between +31 and -32 dwords diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -114,6 +114,7 @@ ; GCN-O0-NEXT: Virtual Register Map ; GCN-O0-NEXT: Live Register Matrix ; GCN-O0-NEXT: SI Pre-allocate WWM Registers +; GCN-O0-NEXT: AMDGPU Pre-RA Long Branch Reg ; GCN-O0-NEXT: Fast Register Allocator ; GCN-O0-NEXT: SI lower SGPR spill instructions ; GCN-O0-NEXT: Fast Register Allocator @@ -349,6 +350,7 @@ ; GCN-O1-NEXT: Live Register Matrix ; GCN-O1-NEXT: SI Pre-allocate WWM Registers ; GCN-O1-NEXT: SI optimize exec mask operations pre-RA +; GCN-O1-NEXT: AMDGPU Pre-RA Long Branch Reg ; GCN-O1-NEXT: Machine Natural Loop Construction ; GCN-O1-NEXT: Machine Block Frequency Analysis ; GCN-O1-NEXT: Debug Variable Analysis @@ -648,6 +650,7 @@ ; GCN-O1-OPTS-NEXT: Live Register Matrix ; GCN-O1-OPTS-NEXT: SI Pre-allocate WWM Registers ; GCN-O1-OPTS-NEXT: SI optimize exec mask operations pre-RA +; GCN-O1-OPTS-NEXT: AMDGPU Pre-RA Long Branch Reg ; GCN-O1-OPTS-NEXT: Machine Natural Loop Construction ; GCN-O1-OPTS-NEXT: Machine Block Frequency Analysis ; GCN-O1-OPTS-NEXT: Debug Variable Analysis @@ -957,6 +960,7 @@ ; GCN-O2-NEXT: SI Pre-allocate WWM Registers ; GCN-O2-NEXT: SI optimize exec mask operations pre-RA ; GCN-O2-NEXT: SI Form memory clauses +; GCN-O2-NEXT: AMDGPU Pre-RA Long Branch Reg ; GCN-O2-NEXT: Machine Natural Loop Construction ; GCN-O2-NEXT: Machine Block Frequency Analysis ; GCN-O2-NEXT: Debug Variable Analysis @@ -1278,6 +1282,7 @@ ; GCN-O3-NEXT: SI Pre-allocate WWM Registers ; GCN-O3-NEXT: SI optimize exec mask operations pre-RA ; GCN-O3-NEXT: SI Form memory clauses +; GCN-O3-NEXT: AMDGPU Pre-RA Long Branch Reg ; GCN-O3-NEXT: Machine Natural Loop Construction ; GCN-O3-NEXT: Machine Block Frequency Analysis ; GCN-O3-NEXT: Debug Variable Analysis diff --git a/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll b/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll @@ -0,0 +1,330 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs -amdgpu-s-branch-bits=4 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s + +; OBJ: Relocations [ +; OBJ-NEXT: ] + +; Used to emit an always 4 byte instruction. Inline asm always assumes +; each instruction is the maximum size. 
+declare void @llvm.amdgcn.s.sleep(i32) #0 + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + + +define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(ptr addrspace(1) %arg, i32 %cnd) #0 { +; GCN-LABEL: uniform_conditional_max_short_forward_branch: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_eq_u32 s2, 0 +; GCN-NEXT: s_cbranch_scc1 .LBB0_2 +; GCN-NEXT: ; %bb.1: ; %bb2 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_sleep 0 +; GCN-NEXT: .LBB0_2: ; %bb3 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm +bb: + %cmp = icmp eq i32 %cnd, 0 + br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch + +bb2: +; 24 bytes + call void asm sideeffect + "v_nop_e64 + v_nop_e64 + v_nop_e64", ""() #0 + call void @llvm.amdgcn.s.sleep(i32 0) + br label %bb3 + +bb3: + store volatile i32 %cnd, ptr addrspace(1) %arg + ret void +} + +define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(ptr addrspace(1) %arg, i32 %cnd) #0 { +; GCN-LABEL: uniform_conditional_min_long_forward_branch: +; GCN: ; %bb.0: ; %bb0 +; GCN-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_eq_u32 s2, 0 +; GCN-NEXT: s_cbranch_scc0 .LBB1_1 +; GCN-NEXT: .LBB1_3: ; %bb0 +; GCN-NEXT: s_getpc_b64 s[8:9] +; GCN-NEXT: .Lpost_getpc0: +; GCN-NEXT: s_add_u32 s8, s8, (.LBB1_2-.Lpost_getpc0)&4294967295 +; GCN-NEXT: s_addc_u32 s9, s9, (.LBB1_2-.Lpost_getpc0)>>32 +; GCN-NEXT: s_setpc_b64 s[8:9] +; GCN-NEXT: .LBB1_1: ; %bb2 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: .LBB1_2: ; %bb3 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm +bb0: + %cmp = icmp eq i32 %cnd, 0 + br i1 %cmp, label %bb3, label %bb2 ; +9 dword branch + +bb2: +; 32 bytes + call void asm sideeffect + "v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64", ""() #0 + br label %bb3 + +bb3: + store volatile i32 %cnd, ptr addrspace(1) %arg + ret void +} + +define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(ptr addrspace(1) %arg, float %cnd) #0 { +; GCN-LABEL: uniform_conditional_min_long_forward_vcnd_branch: +; GCN: ; %bb.0: ; %bb0 +; GCN-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_f32_e64 s[4:5], s2, 0 +; GCN-NEXT: s_and_b64 vcc, exec, s[4:5] +; GCN-NEXT: s_cbranch_vccz .LBB2_1 +; GCN-NEXT: .LBB2_3: ; %bb0 +; GCN-NEXT: s_getpc_b64 s[8:9] +; GCN-NEXT: .Lpost_getpc1: +; GCN-NEXT: s_add_u32 s8, s8, (.LBB2_2-.Lpost_getpc1)&4294967295 +; GCN-NEXT: s_addc_u32 s9, s9, (.LBB2_2-.Lpost_getpc1)>>32 +; GCN-NEXT: s_setpc_b64 s[8:9] +; GCN-NEXT: .LBB2_1: ; %bb2 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; 32 bytes +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: .LBB2_2: ; %bb3 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 
s6, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm +bb0: + %cmp = fcmp oeq float %cnd, 0.0 + br i1 %cmp, label %bb3, label %bb2 ; + 8 dword branch + +bb2: + call void asm sideeffect " ; 32 bytes + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64", ""() #0 + br label %bb3 + +bb3: + store volatile float %cnd, ptr addrspace(1) %arg + ret void +} + +define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 { +; GCN-LABEL: min_long_forward_vbranch: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN-NEXT: s_cbranch_execnz .LBB3_1 +; GCN-NEXT: .LBB3_3: ; %bb +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: .Lpost_getpc2: +; GCN-NEXT: s_add_u32 s4, s4, (.LBB3_2-.Lpost_getpc2)&4294967295 +; GCN-NEXT: s_addc_u32 s5, s5, (.LBB3_2-.Lpost_getpc2)>>32 +; GCN-NEXT: s_setpc_b64 s[4:5] +; GCN-NEXT: .LBB3_1: ; %bb2 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; 32 bytes +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: .LBB3_2: ; %bb3 +; GCN-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN-NEXT: s_mov_b32 s0, s2 +; GCN-NEXT: s_mov_b32 s1, s2 +; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm +bb: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = zext i32 %tid to i64 + %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tid.ext + %load = load volatile i32, ptr addrspace(1) %gep + %cmp = icmp eq i32 %load, 0 + br i1 %cmp, label %bb3, label %bb2 ; + 8 dword branch + +bb2: + call void asm sideeffect " ; 32 bytes + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64", ""() #0 + br label %bb3 + +bb3: + store volatile i32 %load, ptr addrspace(1) %gep + ret void +} + +define amdgpu_kernel void @long_backward_sbranch(ptr addrspace(1) %arg) #0 { +; GCN-LABEL: long_backward_sbranch: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: .LBB4_1: ; %bb2 +; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: s_add_i32 s0, s0, 1 +; GCN-NEXT: s_cmp_lt_i32 s0, 10 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_cbranch_scc0 .LBB4_2 +; GCN-NEXT: .LBB4_3: ; %bb2 +; GCN-NEXT: ; in Loop: Header=BB4_1 Depth=1 +; GCN-NEXT: s_getpc_b64 s[2:3] +; GCN-NEXT: .Lpost_getpc3: +; GCN-NEXT: s_add_u32 s2, s2, (.LBB4_1-.Lpost_getpc3)&4294967295 +; GCN-NEXT: s_addc_u32 s3, s3, (.LBB4_1-.Lpost_getpc3)>>32 +; GCN-NEXT: s_setpc_b64 s[2:3] +; GCN-NEXT: .LBB4_2: ; %bb3 +; GCN-NEXT: s_endpgm + +bb: + br label %bb2 + +bb2: + %loop.idx = phi i32 [ 0, %bb ], [ %inc, %bb2 ] + ; 24 bytes + call void asm sideeffect + "v_nop_e64 + v_nop_e64 + v_nop_e64", ""() #0 + %inc = add nsw i32 %loop.idx, 1 ; add cost 4 + %cmp = icmp slt i32 %inc, 10 ; condition cost = 8 + br i1 %cmp, label %bb2, label %bb3 ; - + +bb3: + ret void +} + +; Requires expansion of unconditional branch from %bb2 to 
%bb4 (and +; expansion of conditional branch from %bb to %bb3. + +define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) { +; GCN-LABEL: uniform_unconditional_min_long_forward_branch: +; GCN: ; %bb.0: ; %bb0 +; GCN-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_eq_u32 s2, 0 +; GCN-NEXT: s_mov_b64 s[2:3], -1 +; GCN-NEXT: s_cbranch_scc0 .LBB5_1 +; GCN-NEXT: .LBB5_7: ; %bb0 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: .Lpost_getpc5: +; GCN-NEXT: s_add_u32 s4, s4, (.LBB5_4-.Lpost_getpc5)&4294967295 +; GCN-NEXT: s_addc_u32 s5, s5, (.LBB5_4-.Lpost_getpc5)>>32 +; GCN-NEXT: s_setpc_b64 s[4:5] +; GCN-NEXT: .LBB5_1: ; %Flow +; GCN-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN-NEXT: s_cbranch_vccnz .LBB5_3 +; GCN-NEXT: .LBB5_2: ; %bb2 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, 17 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: .LBB5_3: ; %bb4 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 63 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm +; GCN-NEXT: .LBB5_4: ; %bb3 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_mov_b64 vcc, exec +; GCN-NEXT: s_cbranch_execnz .LBB5_5 +; GCN-NEXT: .LBB5_9: ; %bb3 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: .Lpost_getpc6: +; GCN-NEXT: s_add_u32 s4, s4, (.LBB5_2-.Lpost_getpc6)&4294967295 +; GCN-NEXT: s_addc_u32 s5, s5, (.LBB5_2-.Lpost_getpc6)>>32 +; GCN-NEXT: s_setpc_b64 s[4:5] +; GCN-NEXT: .LBB5_5: ; %bb3 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: .Lpost_getpc4: +; GCN-NEXT: s_add_u32 s4, s4, (.LBB5_3-.Lpost_getpc4)&4294967295 +; GCN-NEXT: s_addc_u32 s5, s5, (.LBB5_3-.Lpost_getpc4)>>32 +; GCN-NEXT: s_setpc_b64 s[4:5] +bb0: + %tmp = icmp ne i32 %arg1, 0 + br i1 %tmp, label %bb2, label %bb3 + +bb2: + store volatile i32 17, ptr addrspace(1) undef + br label %bb4 + +bb3: + ; 32 byte asm + call void asm sideeffect + "v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64", ""() #0 + br label %bb4 + +bb4: + store volatile i32 63, ptr addrspace(1) %arg + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll @@ -0,0 +1,529 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs -amdgpu-s-branch-bits=5 -stop-after=branch-relaxation %s -o - | FileCheck %s + +; Test long branch reserved register pass when all +; SGPRs are used + +; CHECK-LABEL: {{^}}name: long_branch_used_all_sgprs +; CHECK: machineFunctionInfo: +; CHECK-NEXT: explicitKernArgSize: 12 +; CHECK-NEXT: maxKernArgAlign: 8 +; CHECK-NEXT: ldsSize: 0 +; CHECK-NEXT: gdsSize: 0 +; CHECK-NEXT: dynLDSAlign: 1 +; CHECK-NEXT: isEntryFunction: true +; CHECK-NEXT: noSignedZerosFPMath: false +; CHECK-NEXT: memoryBound: false +; CHECK-NEXT: waveLimiter: false +; CHECK-NEXT: hasSpilledSGPRs: false +; CHECK-NEXT: hasSpilledVGPRs: false +; CHECK-NEXT: scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' +; CHECK-NEXT: frameOffsetReg: '$fp_reg' +; CHECK-NEXT: 
stackPtrOffsetReg: '$sgpr32' +; CHECK-NEXT: bytesInStackArgArea: 0 +; CHECK-NEXT: returnsVoid: true +; CHECK-NEXT: argumentInfo: +; CHECK-NEXT: kernargSegmentPtr: { reg: '$sgpr0_sgpr1' } +; CHECK-NEXT: workGroupIDX: { reg: '$sgpr2' } +; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr3' } +; CHECK-NEXT: workItemIDX: { reg: '$vgpr0' } +; CHECK-NEXT: psInputAddr: 0 +; CHECK-NEXT: psInputEnable: 0 +; CHECK-NEXT: mode: +; CHECK-NEXT: ieee: true +; CHECK-NEXT: dx10-clamp: true +; CHECK-NEXT: fp32-input-denormals: true +; CHECK-NEXT: fp32-output-denormals: true +; CHECK-NEXT: fp64-fp16-input-denormals: true +; CHECK-NEXT: fp64-fp16-output-denormals: true +; CHECK-NEXT: highBitsOf32BitAddress: 0 +; CHECK-NEXT: occupancy: 5 +; CHECK-NEXT: scavengeFI: '%fixed-stack.0' +; CHECK-NEXT: vgprForAGPRCopy: '' +; CHECK-NEXT: longBranchReservedReg: '' +; CHECK-NEXT: body: + define amdgpu_kernel void @long_branch_used_all_sgprs(ptr addrspace(1) %arg, i32 %cnd) #0 { + entry: + %long_branch_used_all_sgprs.kernarg.segment = call nonnull align 16 dereferenceable(48) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() + %cnd.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %long_branch_used_all_sgprs.kernarg.segment, i64 44, !amdgpu.uniform !0 + %cnd.load = load i32, ptr addrspace(4) %cnd.kernarg.offset, align 4, !invariant.load !0 + %sgpr0 = tail call i32 asm sideeffect "s_mov_b32 s0, 0", "={s0}"() #1 + %sgpr1 = tail call i32 asm sideeffect "s_mov_b32 s1, 0", "={s1}"() #1 + %sgpr2 = tail call i32 asm sideeffect "s_mov_b32 s2, 0", "={s2}"() #1 + %sgpr3 = tail call i32 asm sideeffect "s_mov_b32 s3, 0", "={s3}"() #1 + %sgpr4 = tail call i32 asm sideeffect "s_mov_b32 s4, 0", "={s4}"() #1 + %sgpr5 = tail call i32 asm sideeffect "s_mov_b32 s5, 0", "={s5}"() #1 + %sgpr6 = tail call i32 asm sideeffect "s_mov_b32 s6, 0", "={s6}"() #1 + %sgpr7 = tail call i32 asm sideeffect "s_mov_b32 s7, 0", "={s7}"() #1 + %sgpr8 = tail call i32 asm sideeffect "s_mov_b32 s8, 0", "={s8}"() #1 + %sgpr9 = tail call i32 asm sideeffect "s_mov_b32 s9, 0", "={s9}"() #1 + %sgpr10 = tail call i32 asm sideeffect "s_mov_b32 s10, 0", "={s10}"() #1 + %sgpr11 = tail call i32 asm sideeffect "s_mov_b32 s11, 0", "={s11}"() #1 + %sgpr12 = tail call i32 asm sideeffect "s_mov_b32 s12, 0", "={s12}"() #1 + %sgpr13 = tail call i32 asm sideeffect "s_mov_b32 s13, 0", "={s13}"() #1 + %sgpr14 = tail call i32 asm sideeffect "s_mov_b32 s14, 0", "={s14}"() #1 + %sgpr15 = tail call i32 asm sideeffect "s_mov_b32 s15, 0", "={s15}"() #1 + %sgpr16 = tail call i32 asm sideeffect "s_mov_b32 s16, 0", "={s16}"() #1 + %sgpr17 = tail call i32 asm sideeffect "s_mov_b32 s17, 0", "={s17}"() #1 + %sgpr18 = tail call i32 asm sideeffect "s_mov_b32 s18, 0", "={s18}"() #1 + %sgpr19 = tail call i32 asm sideeffect "s_mov_b32 s19, 0", "={s19}"() #1 + %sgpr20 = tail call i32 asm sideeffect "s_mov_b32 s20, 0", "={s20}"() #1 + %sgpr21 = tail call i32 asm sideeffect "s_mov_b32 s21, 0", "={s21}"() #1 + %sgpr22 = tail call i32 asm sideeffect "s_mov_b32 s22, 0", "={s22}"() #1 + %sgpr23 = tail call i32 asm sideeffect "s_mov_b32 s23, 0", "={s23}"() #1 + %sgpr24 = tail call i32 asm sideeffect "s_mov_b32 s24, 0", "={s24}"() #1 + %sgpr25 = tail call i32 asm sideeffect "s_mov_b32 s25, 0", "={s25}"() #1 + %sgpr26 = tail call i32 asm sideeffect "s_mov_b32 s26, 0", "={s26}"() #1 + %sgpr27 = tail call i32 asm sideeffect "s_mov_b32 s27, 0", "={s27}"() #1 + %sgpr28 = tail call i32 asm sideeffect "s_mov_b32 s28, 0", "={s28}"() #1 + %sgpr29 = tail call i32 asm sideeffect "s_mov_b32 s29, 0", 
"={s29}"() #1 + %sgpr30 = tail call i32 asm sideeffect "s_mov_b32 s30, 0", "={s30}"() #1 + %sgpr31 = tail call i32 asm sideeffect "s_mov_b32 s31, 0", "={s31}"() #1 + %sgpr32 = tail call i32 asm sideeffect "s_mov_b32 s32, 0", "={s32}"() #1 + %sgpr33 = tail call i32 asm sideeffect "s_mov_b32 s33, 0", "={s33}"() #1 + %sgpr34 = tail call i32 asm sideeffect "s_mov_b32 s34, 0", "={s34}"() #1 + %sgpr35 = tail call i32 asm sideeffect "s_mov_b32 s35, 0", "={s35}"() #1 + %sgpr36 = tail call i32 asm sideeffect "s_mov_b32 s36, 0", "={s36}"() #1 + %sgpr37 = tail call i32 asm sideeffect "s_mov_b32 s37, 0", "={s37}"() #1 + %sgpr38 = tail call i32 asm sideeffect "s_mov_b32 s38, 0", "={s38}"() #1 + %sgpr39 = tail call i32 asm sideeffect "s_mov_b32 s39, 0", "={s39}"() #1 + %sgpr40 = tail call i32 asm sideeffect "s_mov_b32 s40, 0", "={s40}"() #1 + %sgpr41 = tail call i32 asm sideeffect "s_mov_b32 s41, 0", "={s41}"() #1 + %sgpr42 = tail call i32 asm sideeffect "s_mov_b32 s42, 0", "={s42}"() #1 + %sgpr43 = tail call i32 asm sideeffect "s_mov_b32 s43, 0", "={s43}"() #1 + %sgpr44 = tail call i32 asm sideeffect "s_mov_b32 s44, 0", "={s44}"() #1 + %sgpr45 = tail call i32 asm sideeffect "s_mov_b32 s45, 0", "={s45}"() #1 + %sgpr46 = tail call i32 asm sideeffect "s_mov_b32 s46, 0", "={s46}"() #1 + %sgpr47 = tail call i32 asm sideeffect "s_mov_b32 s47, 0", "={s47}"() #1 + %sgpr48 = tail call i32 asm sideeffect "s_mov_b32 s48, 0", "={s48}"() #1 + %sgpr49 = tail call i32 asm sideeffect "s_mov_b32 s49, 0", "={s49}"() #1 + %sgpr50 = tail call i32 asm sideeffect "s_mov_b32 s50, 0", "={s50}"() #1 + %sgpr51 = tail call i32 asm sideeffect "s_mov_b32 s51, 0", "={s51}"() #1 + %sgpr52 = tail call i32 asm sideeffect "s_mov_b32 s52, 0", "={s52}"() #1 + %sgpr53 = tail call i32 asm sideeffect "s_mov_b32 s53, 0", "={s53}"() #1 + %sgpr54 = tail call i32 asm sideeffect "s_mov_b32 s54, 0", "={s54}"() #1 + %sgpr55 = tail call i32 asm sideeffect "s_mov_b32 s55, 0", "={s55}"() #1 + %sgpr56 = tail call i32 asm sideeffect "s_mov_b32 s56, 0", "={s56}"() #1 + %sgpr57 = tail call i32 asm sideeffect "s_mov_b32 s57, 0", "={s57}"() #1 + %sgpr58 = tail call i32 asm sideeffect "s_mov_b32 s58, 0", "={s58}"() #1 + %sgpr59 = tail call i32 asm sideeffect "s_mov_b32 s59, 0", "={s59}"() #1 + %sgpr60 = tail call i32 asm sideeffect "s_mov_b32 s60, 0", "={s60}"() #1 + %sgpr61 = tail call i32 asm sideeffect "s_mov_b32 s61, 0", "={s61}"() #1 + %sgpr62 = tail call i32 asm sideeffect "s_mov_b32 s62, 0", "={s62}"() #1 + %sgpr63 = tail call i32 asm sideeffect "s_mov_b32 s63, 0", "={s63}"() #1 + %sgpr64 = tail call i32 asm sideeffect "s_mov_b32 s64, 0", "={s64}"() #1 + %sgpr65 = tail call i32 asm sideeffect "s_mov_b32 s65, 0", "={s65}"() #1 + %sgpr66 = tail call i32 asm sideeffect "s_mov_b32 s66, 0", "={s66}"() #1 + %sgpr67 = tail call i32 asm sideeffect "s_mov_b32 s67, 0", "={s67}"() #1 + %sgpr68 = tail call i32 asm sideeffect "s_mov_b32 s68, 0", "={s68}"() #1 + %sgpr69 = tail call i32 asm sideeffect "s_mov_b32 s69, 0", "={s69}"() #1 + %sgpr70 = tail call i32 asm sideeffect "s_mov_b32 s70, 0", "={s70}"() #1 + %sgpr71 = tail call i32 asm sideeffect "s_mov_b32 s71, 0", "={s71}"() #1 + %sgpr72 = tail call i32 asm sideeffect "s_mov_b32 s72, 0", "={s72}"() #1 + %sgpr73 = tail call i32 asm sideeffect "s_mov_b32 s73, 0", "={s73}"() #1 + %sgpr74 = tail call i32 asm sideeffect "s_mov_b32 s74, 0", "={s74}"() #1 + %sgpr75 = tail call i32 asm sideeffect "s_mov_b32 s75, 0", "={s75}"() #1 + %sgpr76 = tail call i32 asm sideeffect "s_mov_b32 s76, 0", "={s76}"() #1 + %sgpr77 = 
tail call i32 asm sideeffect "s_mov_b32 s77, 0", "={s77}"() #1 + %sgpr78 = tail call i32 asm sideeffect "s_mov_b32 s78, 0", "={s78}"() #1 + %sgpr79 = tail call i32 asm sideeffect "s_mov_b32 s79, 0", "={s79}"() #1 + %sgpr80 = tail call i32 asm sideeffect "s_mov_b32 s80, 0", "={s80}"() #1 + %sgpr81 = tail call i32 asm sideeffect "s_mov_b32 s81, 0", "={s81}"() #1 + %sgpr82 = tail call i32 asm sideeffect "s_mov_b32 s82, 0", "={s82}"() #1 + %sgpr83 = tail call i32 asm sideeffect "s_mov_b32 s83, 0", "={s83}"() #1 + %sgpr84 = tail call i32 asm sideeffect "s_mov_b32 s84, 0", "={s84}"() #1 + %sgpr85 = tail call i32 asm sideeffect "s_mov_b32 s85, 0", "={s85}"() #1 + %sgpr86 = tail call i32 asm sideeffect "s_mov_b32 s86, 0", "={s86}"() #1 + %sgpr87 = tail call i32 asm sideeffect "s_mov_b32 s87, 0", "={s87}"() #1 + %sgpr88 = tail call i32 asm sideeffect "s_mov_b32 s88, 0", "={s88}"() #1 + %sgpr89 = tail call i32 asm sideeffect "s_mov_b32 s89, 0", "={s89}"() #1 + %sgpr90 = tail call i32 asm sideeffect "s_mov_b32 s90, 0", "={s90}"() #1 + %sgpr91 = tail call i32 asm sideeffect "s_mov_b32 s91, 0", "={s91}"() #1 + %sgpr92 = tail call i32 asm sideeffect "s_mov_b32 s92, 0", "={s92}"() #1 + %sgpr93 = tail call i32 asm sideeffect "s_mov_b32 s93, 0", "={s93}"() #1 + %sgpr94 = tail call i32 asm sideeffect "s_mov_b32 s94, 0", "={s94}"() #1 + %sgpr95 = tail call i32 asm sideeffect "s_mov_b32 s95, 0", "={s95}"() #1 + %sgpr96 = tail call i32 asm sideeffect "s_mov_b32 s96, 0", "={s96}"() #1 + %sgpr97 = tail call i32 asm sideeffect "s_mov_b32 s97, 0", "={s97}"() #1 + %sgpr98 = tail call i32 asm sideeffect "s_mov_b32 s98, 0", "={s98}"() #1 + %sgpr99 = tail call i32 asm sideeffect "s_mov_b32 s99, 0", "={s99}"() #1 + %sgpr100 = tail call i32 asm sideeffect "s_mov_b32 s100, 0", "={s100}"() #1 + %sgpr101 = tail call i32 asm sideeffect "s_mov_b32 s101, 0", "={s101}"() #1 + %vcc_lo = tail call i32 asm sideeffect "s_mov_b32 $0, 0", "={vcc_lo}"() #1 + %vcc_hi = tail call i32 asm sideeffect "s_mov_b32 $0, 0", "={vcc_hi}"() #1 + %cmp = icmp ne i32 %cnd.load, 0 + br i1 %cmp, label %bb2, label %bb3, !amdgpu.uniform !0 + + bb2: ; preds = %entry + call void asm sideeffect "v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", ""() #1 + br label %bb3, !amdgpu.uniform !0 + + bb3: ; preds = %bb2, %entry + tail call void asm sideeffect "; reg use $0", "{s0}"(i32 %sgpr0) #1 + tail call void asm sideeffect "; reg use $0", "{s1}"(i32 %sgpr1) #1 + tail call void asm sideeffect "; reg use $0", "{s2}"(i32 %sgpr2) #1 + tail call void asm sideeffect "; reg use $0", "{s3}"(i32 %sgpr3) #1 + tail call void asm sideeffect "; reg use $0", "{s4}"(i32 %sgpr4) #1 + tail call void asm sideeffect "; reg use $0", "{s5}"(i32 %sgpr5) #1 + tail call void asm sideeffect "; reg use $0", "{s6}"(i32 %sgpr6) #1 + tail call void asm sideeffect "; reg use $0", "{s7}"(i32 %sgpr7) #1 + tail call void asm sideeffect "; reg use $0", "{s8}"(i32 %sgpr8) #1 + tail call void asm sideeffect "; reg use $0", "{s9}"(i32 %sgpr9) #1 + tail call void asm sideeffect "; reg use $0", "{s10}"(i32 %sgpr10) #1 + tail call void asm sideeffect "; reg use $0", "{s11}"(i32 %sgpr11) #1 + tail call void asm sideeffect "; reg use $0", "{s12}"(i32 %sgpr12) #1 + tail call void asm sideeffect "; reg use $0", "{s13}"(i32 %sgpr13) #1 + tail call void asm sideeffect "; reg use $0", "{s14}"(i32 %sgpr14) #1 + tail call void asm sideeffect "; reg use $0", "{s15}"(i32 %sgpr15) #1 + tail call void asm sideeffect "; reg use $0", "{s16}"(i32 %sgpr16) #1 + 
tail call void asm sideeffect "; reg use $0", "{s17}"(i32 %sgpr17) #1 + tail call void asm sideeffect "; reg use $0", "{s18}"(i32 %sgpr18) #1 + tail call void asm sideeffect "; reg use $0", "{s19}"(i32 %sgpr19) #1 + tail call void asm sideeffect "; reg use $0", "{s20}"(i32 %sgpr20) #1 + tail call void asm sideeffect "; reg use $0", "{s21}"(i32 %sgpr21) #1 + tail call void asm sideeffect "; reg use $0", "{s22}"(i32 %sgpr22) #1 + tail call void asm sideeffect "; reg use $0", "{s23}"(i32 %sgpr23) #1 + tail call void asm sideeffect "; reg use $0", "{s24}"(i32 %sgpr24) #1 + tail call void asm sideeffect "; reg use $0", "{s25}"(i32 %sgpr25) #1 + tail call void asm sideeffect "; reg use $0", "{s26}"(i32 %sgpr26) #1 + tail call void asm sideeffect "; reg use $0", "{s27}"(i32 %sgpr27) #1 + tail call void asm sideeffect "; reg use $0", "{s28}"(i32 %sgpr28) #1 + tail call void asm sideeffect "; reg use $0", "{s29}"(i32 %sgpr29) #1 + tail call void asm sideeffect "; reg use $0", "{s30}"(i32 %sgpr30) #1 + tail call void asm sideeffect "; reg use $0", "{s31}"(i32 %sgpr31) #1 + tail call void asm sideeffect "; reg use $0", "{s32}"(i32 %sgpr32) #1 + tail call void asm sideeffect "; reg use $0", "{s33}"(i32 %sgpr33) #1 + tail call void asm sideeffect "; reg use $0", "{s34}"(i32 %sgpr34) #1 + tail call void asm sideeffect "; reg use $0", "{s35}"(i32 %sgpr35) #1 + tail call void asm sideeffect "; reg use $0", "{s36}"(i32 %sgpr36) #1 + tail call void asm sideeffect "; reg use $0", "{s37}"(i32 %sgpr37) #1 + tail call void asm sideeffect "; reg use $0", "{s38}"(i32 %sgpr38) #1 + tail call void asm sideeffect "; reg use $0", "{s39}"(i32 %sgpr39) #1 + tail call void asm sideeffect "; reg use $0", "{s40}"(i32 %sgpr40) #1 + tail call void asm sideeffect "; reg use $0", "{s41}"(i32 %sgpr41) #1 + tail call void asm sideeffect "; reg use $0", "{s42}"(i32 %sgpr42) #1 + tail call void asm sideeffect "; reg use $0", "{s43}"(i32 %sgpr43) #1 + tail call void asm sideeffect "; reg use $0", "{s44}"(i32 %sgpr44) #1 + tail call void asm sideeffect "; reg use $0", "{s45}"(i32 %sgpr45) #1 + tail call void asm sideeffect "; reg use $0", "{s46}"(i32 %sgpr46) #1 + tail call void asm sideeffect "; reg use $0", "{s47}"(i32 %sgpr47) #1 + tail call void asm sideeffect "; reg use $0", "{s48}"(i32 %sgpr48) #1 + tail call void asm sideeffect "; reg use $0", "{s49}"(i32 %sgpr49) #1 + tail call void asm sideeffect "; reg use $0", "{s50}"(i32 %sgpr50) #1 + tail call void asm sideeffect "; reg use $0", "{s51}"(i32 %sgpr51) #1 + tail call void asm sideeffect "; reg use $0", "{s52}"(i32 %sgpr52) #1 + tail call void asm sideeffect "; reg use $0", "{s53}"(i32 %sgpr53) #1 + tail call void asm sideeffect "; reg use $0", "{s54}"(i32 %sgpr54) #1 + tail call void asm sideeffect "; reg use $0", "{s55}"(i32 %sgpr55) #1 + tail call void asm sideeffect "; reg use $0", "{s56}"(i32 %sgpr56) #1 + tail call void asm sideeffect "; reg use $0", "{s57}"(i32 %sgpr57) #1 + tail call void asm sideeffect "; reg use $0", "{s58}"(i32 %sgpr58) #1 + tail call void asm sideeffect "; reg use $0", "{s59}"(i32 %sgpr59) #1 + tail call void asm sideeffect "; reg use $0", "{s60}"(i32 %sgpr60) #1 + tail call void asm sideeffect "; reg use $0", "{s61}"(i32 %sgpr61) #1 + tail call void asm sideeffect "; reg use $0", "{s62}"(i32 %sgpr62) #1 + tail call void asm sideeffect "; reg use $0", "{s63}"(i32 %sgpr63) #1 + tail call void asm sideeffect "; reg use $0", "{s64}"(i32 %sgpr64) #1 + tail call void asm sideeffect "; reg use $0", "{s65}"(i32 %sgpr65) #1 + tail call void asm 
sideeffect "; reg use $0", "{s66}"(i32 %sgpr66) #1 + tail call void asm sideeffect "; reg use $0", "{s67}"(i32 %sgpr67) #1 + tail call void asm sideeffect "; reg use $0", "{s68}"(i32 %sgpr68) #1 + tail call void asm sideeffect "; reg use $0", "{s69}"(i32 %sgpr69) #1 + tail call void asm sideeffect "; reg use $0", "{s70}"(i32 %sgpr70) #1 + tail call void asm sideeffect "; reg use $0", "{s71}"(i32 %sgpr71) #1 + tail call void asm sideeffect "; reg use $0", "{s72}"(i32 %sgpr72) #1 + tail call void asm sideeffect "; reg use $0", "{s73}"(i32 %sgpr73) #1 + tail call void asm sideeffect "; reg use $0", "{s74}"(i32 %sgpr74) #1 + tail call void asm sideeffect "; reg use $0", "{s75}"(i32 %sgpr75) #1 + tail call void asm sideeffect "; reg use $0", "{s76}"(i32 %sgpr76) #1 + tail call void asm sideeffect "; reg use $0", "{s77}"(i32 %sgpr77) #1 + tail call void asm sideeffect "; reg use $0", "{s78}"(i32 %sgpr78) #1 + tail call void asm sideeffect "; reg use $0", "{s79}"(i32 %sgpr79) #1 + tail call void asm sideeffect "; reg use $0", "{s80}"(i32 %sgpr80) #1 + tail call void asm sideeffect "; reg use $0", "{s81}"(i32 %sgpr81) #1 + tail call void asm sideeffect "; reg use $0", "{s82}"(i32 %sgpr82) #1 + tail call void asm sideeffect "; reg use $0", "{s83}"(i32 %sgpr83) #1 + tail call void asm sideeffect "; reg use $0", "{s84}"(i32 %sgpr84) #1 + tail call void asm sideeffect "; reg use $0", "{s85}"(i32 %sgpr85) #1 + tail call void asm sideeffect "; reg use $0", "{s86}"(i32 %sgpr86) #1 + tail call void asm sideeffect "; reg use $0", "{s87}"(i32 %sgpr87) #1 + tail call void asm sideeffect "; reg use $0", "{s88}"(i32 %sgpr88) #1 + tail call void asm sideeffect "; reg use $0", "{s89}"(i32 %sgpr89) #1 + tail call void asm sideeffect "; reg use $0", "{s90}"(i32 %sgpr90) #1 + tail call void asm sideeffect "; reg use $0", "{s91}"(i32 %sgpr91) #1 + tail call void asm sideeffect "; reg use $0", "{s92}"(i32 %sgpr92) #1 + tail call void asm sideeffect "; reg use $0", "{s93}"(i32 %sgpr93) #1 + tail call void asm sideeffect "; reg use $0", "{s94}"(i32 %sgpr94) #1 + tail call void asm sideeffect "; reg use $0", "{s95}"(i32 %sgpr95) #1 + tail call void asm sideeffect "; reg use $0", "{s96}"(i32 %sgpr96) #1 + tail call void asm sideeffect "; reg use $0", "{s97}"(i32 %sgpr97) #1 + tail call void asm sideeffect "; reg use $0", "{s98}"(i32 %sgpr98) #1 + tail call void asm sideeffect "; reg use $0", "{s99}"(i32 %sgpr99) #1 + tail call void asm sideeffect "; reg use $0", "{s100}"(i32 %sgpr100) #1 + tail call void asm sideeffect "; reg use $0", "{s101}"(i32 %sgpr101) #1 + tail call void asm sideeffect "; reg use $0", "{vcc_lo}"(i32 %vcc_lo) #1 + tail call void asm sideeffect "; reg use $0", "{vcc_hi}"(i32 %vcc_hi) #1 + ret void + } + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +; CHECK-LABEL: {{^}}name: long_branch_high_num_sgprs_used +; CHECK: machineFunctionInfo: +; CHECK-NEXT: explicitKernArgSize: 12 +; CHECK-NEXT: maxKernArgAlign: 8 +; CHECK-NEXT: ldsSize: 0 +; CHECK-NEXT: gdsSize: 0 +; CHECK-NEXT: dynLDSAlign: 1 +; CHECK-NEXT: isEntryFunction: true +; CHECK-NEXT: noSignedZerosFPMath: false +; CHECK-NEXT: memoryBound: false +; CHECK-NEXT: waveLimiter: false +; CHECK-NEXT: hasSpilledSGPRs: false +; CHECK-NEXT: hasSpilledVGPRs: false +; CHECK-NEXT: scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' +; CHECK-NEXT: frameOffsetReg: '$fp_reg' +; CHECK-NEXT: stackPtrOffsetReg: '$sgpr32' +; CHECK-NEXT: bytesInStackArgArea: 0 +; CHECK-NEXT: returnsVoid: true +; CHECK-NEXT: argumentInfo: +; 
CHECK-NEXT: kernargSegmentPtr: { reg: '$sgpr0_sgpr1' } +; CHECK-NEXT: workGroupIDX: { reg: '$sgpr2' } +; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr3' } +; CHECK-NEXT: workItemIDX: { reg: '$vgpr0' } +; CHECK-NEXT: psInputAddr: 0 +; CHECK-NEXT: psInputEnable: 0 +; CHECK-NEXT: mode: +; CHECK-NEXT: ieee: true +; CHECK-NEXT: dx10-clamp: true +; CHECK-NEXT: fp32-input-denormals: true +; CHECK-NEXT: fp32-output-denormals: true +; CHECK-NEXT: fp64-fp16-input-denormals: true +; CHECK-NEXT: fp64-fp16-output-denormals: true +; CHECK-NEXT: highBitsOf32BitAddress: 0 +; CHECK-NEXT: occupancy: 5 +; CHECK-NEXT: vgprForAGPRCopy: '' +; CHECK-NEXT: longBranchReservedReg: '$sgpr100_sgpr101' +; CHECK-NEXT: body: + define amdgpu_kernel void @long_branch_high_num_sgprs_used(ptr addrspace(1) %arg, i32 %cnd) #0 { + entry: + %long_branch_used_all_sgprs.kernarg.segment = call nonnull align 16 dereferenceable(48) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() + %cnd.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %long_branch_used_all_sgprs.kernarg.segment, i64 44, !amdgpu.uniform !0 + %cnd.load = load i32, ptr addrspace(4) %cnd.kernarg.offset, align 4, !invariant.load !0 + %sgpr0 = tail call i32 asm sideeffect "s_mov_b32 s0, 0", "={s0}"() #1 + %sgpr1 = tail call i32 asm sideeffect "s_mov_b32 s1, 0", "={s1}"() #1 + %sgpr2 = tail call i32 asm sideeffect "s_mov_b32 s2, 0", "={s2}"() #1 + %sgpr3 = tail call i32 asm sideeffect "s_mov_b32 s3, 0", "={s3}"() #1 + %sgpr4 = tail call i32 asm sideeffect "s_mov_b32 s4, 0", "={s4}"() #1 + %sgpr5 = tail call i32 asm sideeffect "s_mov_b32 s5, 0", "={s5}"() #1 + %sgpr6 = tail call i32 asm sideeffect "s_mov_b32 s6, 0", "={s6}"() #1 + %sgpr7 = tail call i32 asm sideeffect "s_mov_b32 s7, 0", "={s7}"() #1 + %sgpr8 = tail call i32 asm sideeffect "s_mov_b32 s8, 0", "={s8}"() #1 + %sgpr9 = tail call i32 asm sideeffect "s_mov_b32 s9, 0", "={s9}"() #1 + %sgpr10 = tail call i32 asm sideeffect "s_mov_b32 s10, 0", "={s10}"() #1 + %sgpr11 = tail call i32 asm sideeffect "s_mov_b32 s11, 0", "={s11}"() #1 + %sgpr12 = tail call i32 asm sideeffect "s_mov_b32 s12, 0", "={s12}"() #1 + %sgpr13 = tail call i32 asm sideeffect "s_mov_b32 s13, 0", "={s13}"() #1 + %sgpr14 = tail call i32 asm sideeffect "s_mov_b32 s14, 0", "={s14}"() #1 + %sgpr15 = tail call i32 asm sideeffect "s_mov_b32 s15, 0", "={s15}"() #1 + %sgpr16 = tail call i32 asm sideeffect "s_mov_b32 s16, 0", "={s16}"() #1 + %sgpr17 = tail call i32 asm sideeffect "s_mov_b32 s17, 0", "={s17}"() #1 + %sgpr18 = tail call i32 asm sideeffect "s_mov_b32 s18, 0", "={s18}"() #1 + %sgpr19 = tail call i32 asm sideeffect "s_mov_b32 s19, 0", "={s19}"() #1 + %sgpr20 = tail call i32 asm sideeffect "s_mov_b32 s20, 0", "={s20}"() #1 + %sgpr21 = tail call i32 asm sideeffect "s_mov_b32 s21, 0", "={s21}"() #1 + %sgpr22 = tail call i32 asm sideeffect "s_mov_b32 s22, 0", "={s22}"() #1 + %sgpr23 = tail call i32 asm sideeffect "s_mov_b32 s23, 0", "={s23}"() #1 + %sgpr24 = tail call i32 asm sideeffect "s_mov_b32 s24, 0", "={s24}"() #1 + %sgpr25 = tail call i32 asm sideeffect "s_mov_b32 s25, 0", "={s25}"() #1 + %sgpr26 = tail call i32 asm sideeffect "s_mov_b32 s26, 0", "={s26}"() #1 + %sgpr27 = tail call i32 asm sideeffect "s_mov_b32 s27, 0", "={s27}"() #1 + %sgpr28 = tail call i32 asm sideeffect "s_mov_b32 s28, 0", "={s28}"() #1 + %sgpr29 = tail call i32 asm sideeffect "s_mov_b32 s29, 0", "={s29}"() #1 + %sgpr30 = tail call i32 asm sideeffect "s_mov_b32 s30, 0", "={s30}"() #1 + %sgpr31 = tail call i32 asm sideeffect "s_mov_b32 s31, 0", 
"={s31}"() #1 + %sgpr32 = tail call i32 asm sideeffect "s_mov_b32 s32, 0", "={s32}"() #1 + %sgpr33 = tail call i32 asm sideeffect "s_mov_b32 s33, 0", "={s33}"() #1 + %sgpr34 = tail call i32 asm sideeffect "s_mov_b32 s34, 0", "={s34}"() #1 + %sgpr35 = tail call i32 asm sideeffect "s_mov_b32 s35, 0", "={s35}"() #1 + %sgpr36 = tail call i32 asm sideeffect "s_mov_b32 s36, 0", "={s36}"() #1 + %sgpr37 = tail call i32 asm sideeffect "s_mov_b32 s37, 0", "={s37}"() #1 + %sgpr38 = tail call i32 asm sideeffect "s_mov_b32 s38, 0", "={s38}"() #1 + %sgpr39 = tail call i32 asm sideeffect "s_mov_b32 s39, 0", "={s39}"() #1 + %sgpr40 = tail call i32 asm sideeffect "s_mov_b32 s40, 0", "={s40}"() #1 + %sgpr41 = tail call i32 asm sideeffect "s_mov_b32 s41, 0", "={s41}"() #1 + %sgpr42 = tail call i32 asm sideeffect "s_mov_b32 s42, 0", "={s42}"() #1 + %sgpr43 = tail call i32 asm sideeffect "s_mov_b32 s43, 0", "={s43}"() #1 + %sgpr44 = tail call i32 asm sideeffect "s_mov_b32 s44, 0", "={s44}"() #1 + %sgpr45 = tail call i32 asm sideeffect "s_mov_b32 s45, 0", "={s45}"() #1 + %sgpr46 = tail call i32 asm sideeffect "s_mov_b32 s46, 0", "={s46}"() #1 + %sgpr47 = tail call i32 asm sideeffect "s_mov_b32 s47, 0", "={s47}"() #1 + %sgpr48 = tail call i32 asm sideeffect "s_mov_b32 s48, 0", "={s48}"() #1 + %sgpr49 = tail call i32 asm sideeffect "s_mov_b32 s49, 0", "={s49}"() #1 + %sgpr50 = tail call i32 asm sideeffect "s_mov_b32 s50, 0", "={s50}"() #1 + %sgpr51 = tail call i32 asm sideeffect "s_mov_b32 s51, 0", "={s51}"() #1 + %sgpr52 = tail call i32 asm sideeffect "s_mov_b32 s52, 0", "={s52}"() #1 + %sgpr53 = tail call i32 asm sideeffect "s_mov_b32 s53, 0", "={s53}"() #1 + %sgpr54 = tail call i32 asm sideeffect "s_mov_b32 s54, 0", "={s54}"() #1 + %sgpr55 = tail call i32 asm sideeffect "s_mov_b32 s55, 0", "={s55}"() #1 + %sgpr56 = tail call i32 asm sideeffect "s_mov_b32 s56, 0", "={s56}"() #1 + %sgpr57 = tail call i32 asm sideeffect "s_mov_b32 s57, 0", "={s57}"() #1 + %sgpr58 = tail call i32 asm sideeffect "s_mov_b32 s58, 0", "={s58}"() #1 + %sgpr59 = tail call i32 asm sideeffect "s_mov_b32 s59, 0", "={s59}"() #1 + %sgpr60 = tail call i32 asm sideeffect "s_mov_b32 s60, 0", "={s60}"() #1 + %sgpr61 = tail call i32 asm sideeffect "s_mov_b32 s61, 0", "={s61}"() #1 + %sgpr62 = tail call i32 asm sideeffect "s_mov_b32 s62, 0", "={s62}"() #1 + %sgpr63 = tail call i32 asm sideeffect "s_mov_b32 s63, 0", "={s63}"() #1 + %sgpr64 = tail call i32 asm sideeffect "s_mov_b32 s64, 0", "={s64}"() #1 + %sgpr65 = tail call i32 asm sideeffect "s_mov_b32 s65, 0", "={s65}"() #1 + %sgpr66 = tail call i32 asm sideeffect "s_mov_b32 s66, 0", "={s66}"() #1 + %sgpr67 = tail call i32 asm sideeffect "s_mov_b32 s67, 0", "={s67}"() #1 + %sgpr68 = tail call i32 asm sideeffect "s_mov_b32 s68, 0", "={s68}"() #1 + %sgpr69 = tail call i32 asm sideeffect "s_mov_b32 s69, 0", "={s69}"() #1 + %sgpr70 = tail call i32 asm sideeffect "s_mov_b32 s70, 0", "={s70}"() #1 + %sgpr71 = tail call i32 asm sideeffect "s_mov_b32 s71, 0", "={s71}"() #1 + %sgpr72 = tail call i32 asm sideeffect "s_mov_b32 s72, 0", "={s72}"() #1 + %sgpr73 = tail call i32 asm sideeffect "s_mov_b32 s73, 0", "={s73}"() #1 + %sgpr74 = tail call i32 asm sideeffect "s_mov_b32 s74, 0", "={s74}"() #1 + %sgpr75 = tail call i32 asm sideeffect "s_mov_b32 s75, 0", "={s75}"() #1 + %sgpr76 = tail call i32 asm sideeffect "s_mov_b32 s76, 0", "={s76}"() #1 + %sgpr77 = tail call i32 asm sideeffect "s_mov_b32 s77, 0", "={s77}"() #1 + %sgpr78 = tail call i32 asm sideeffect "s_mov_b32 s78, 0", "={s78}"() #1 + %sgpr79 = 
tail call i32 asm sideeffect "s_mov_b32 s79, 0", "={s79}"() #1 + %sgpr80 = tail call i32 asm sideeffect "s_mov_b32 s80, 0", "={s80}"() #1 + %sgpr81 = tail call i32 asm sideeffect "s_mov_b32 s81, 0", "={s81}"() #1 + %sgpr82 = tail call i32 asm sideeffect "s_mov_b32 s82, 0", "={s82}"() #1 + %sgpr83 = tail call i32 asm sideeffect "s_mov_b32 s83, 0", "={s83}"() #1 + %sgpr84 = tail call i32 asm sideeffect "s_mov_b32 s84, 0", "={s84}"() #1 + %sgpr85 = tail call i32 asm sideeffect "s_mov_b32 s85, 0", "={s85}"() #1 + %sgpr86 = tail call i32 asm sideeffect "s_mov_b32 s86, 0", "={s86}"() #1 + %sgpr87 = tail call i32 asm sideeffect "s_mov_b32 s87, 0", "={s87}"() #1 + %sgpr88 = tail call i32 asm sideeffect "s_mov_b32 s88, 0", "={s88}"() #1 + %sgpr89 = tail call i32 asm sideeffect "s_mov_b32 s89, 0", "={s89}"() #1 + %sgpr90 = tail call i32 asm sideeffect "s_mov_b32 s90, 0", "={s90}"() #1 + %sgpr91 = tail call i32 asm sideeffect "s_mov_b32 s91, 0", "={s91}"() #1 + %sgpr92 = tail call i32 asm sideeffect "s_mov_b32 s92, 0", "={s92}"() #1 + %sgpr93 = tail call i32 asm sideeffect "s_mov_b32 s93, 0", "={s93}"() #1 + %sgpr94 = tail call i32 asm sideeffect "s_mov_b32 s94, 0", "={s94}"() #1 + %sgpr95 = tail call i32 asm sideeffect "s_mov_b32 s95, 0", "={s95}"() #1 + %sgpr96 = tail call i32 asm sideeffect "s_mov_b32 s96, 0", "={s96}"() #1 + %sgpr97 = tail call i32 asm sideeffect "s_mov_b32 s97, 0", "={s97}"() #1 + %sgpr98 = tail call i32 asm sideeffect "s_mov_b32 s98, 0", "={s98}"() #1 + %sgpr99 = tail call i32 asm sideeffect "s_mov_b32 s99, 0", "={s99}"() #1 + %vcc_lo = tail call i32 asm sideeffect "s_mov_b32 $0, 0", "={vcc_lo}"() #1 + %vcc_hi = tail call i32 asm sideeffect "s_mov_b32 $0, 0", "={vcc_hi}"() #1 + %cmp = icmp ne i32 %cnd.load, 0 + br i1 %cmp, label %bb2, label %bb3, !amdgpu.uniform !0 + + bb2: ; preds = %entry + call void asm sideeffect "v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", ""() #1 + br label %bb3, !amdgpu.uniform !0 + + bb3: ; preds = %bb2, %entry + tail call void asm sideeffect "; reg use $0", "{s0}"(i32 %sgpr0) #1 + tail call void asm sideeffect "; reg use $0", "{s1}"(i32 %sgpr1) #1 + tail call void asm sideeffect "; reg use $0", "{s2}"(i32 %sgpr2) #1 + tail call void asm sideeffect "; reg use $0", "{s3}"(i32 %sgpr3) #1 + tail call void asm sideeffect "; reg use $0", "{s4}"(i32 %sgpr4) #1 + tail call void asm sideeffect "; reg use $0", "{s5}"(i32 %sgpr5) #1 + tail call void asm sideeffect "; reg use $0", "{s6}"(i32 %sgpr6) #1 + tail call void asm sideeffect "; reg use $0", "{s7}"(i32 %sgpr7) #1 + tail call void asm sideeffect "; reg use $0", "{s8}"(i32 %sgpr8) #1 + tail call void asm sideeffect "; reg use $0", "{s9}"(i32 %sgpr9) #1 + tail call void asm sideeffect "; reg use $0", "{s10}"(i32 %sgpr10) #1 + tail call void asm sideeffect "; reg use $0", "{s11}"(i32 %sgpr11) #1 + tail call void asm sideeffect "; reg use $0", "{s12}"(i32 %sgpr12) #1 + tail call void asm sideeffect "; reg use $0", "{s13}"(i32 %sgpr13) #1 + tail call void asm sideeffect "; reg use $0", "{s14}"(i32 %sgpr14) #1 + tail call void asm sideeffect "; reg use $0", "{s15}"(i32 %sgpr15) #1 + tail call void asm sideeffect "; reg use $0", "{s16}"(i32 %sgpr16) #1 + tail call void asm sideeffect "; reg use $0", "{s17}"(i32 %sgpr17) #1 + tail call void asm sideeffect "; reg use $0", "{s18}"(i32 %sgpr18) #1 + tail call void asm sideeffect "; reg use $0", "{s19}"(i32 %sgpr19) #1 + tail call void asm sideeffect "; reg use $0", "{s20}"(i32 %sgpr20) #1 + tail call void asm 
sideeffect "; reg use $0", "{s21}"(i32 %sgpr21) #1 + tail call void asm sideeffect "; reg use $0", "{s22}"(i32 %sgpr22) #1 + tail call void asm sideeffect "; reg use $0", "{s23}"(i32 %sgpr23) #1 + tail call void asm sideeffect "; reg use $0", "{s24}"(i32 %sgpr24) #1 + tail call void asm sideeffect "; reg use $0", "{s25}"(i32 %sgpr25) #1 + tail call void asm sideeffect "; reg use $0", "{s26}"(i32 %sgpr26) #1 + tail call void asm sideeffect "; reg use $0", "{s27}"(i32 %sgpr27) #1 + tail call void asm sideeffect "; reg use $0", "{s28}"(i32 %sgpr28) #1 + tail call void asm sideeffect "; reg use $0", "{s29}"(i32 %sgpr29) #1 + tail call void asm sideeffect "; reg use $0", "{s30}"(i32 %sgpr30) #1 + tail call void asm sideeffect "; reg use $0", "{s31}"(i32 %sgpr31) #1 + tail call void asm sideeffect "; reg use $0", "{s32}"(i32 %sgpr32) #1 + tail call void asm sideeffect "; reg use $0", "{s33}"(i32 %sgpr33) #1 + tail call void asm sideeffect "; reg use $0", "{s34}"(i32 %sgpr34) #1 + tail call void asm sideeffect "; reg use $0", "{s35}"(i32 %sgpr35) #1 + tail call void asm sideeffect "; reg use $0", "{s36}"(i32 %sgpr36) #1 + tail call void asm sideeffect "; reg use $0", "{s37}"(i32 %sgpr37) #1 + tail call void asm sideeffect "; reg use $0", "{s38}"(i32 %sgpr38) #1 + tail call void asm sideeffect "; reg use $0", "{s39}"(i32 %sgpr39) #1 + tail call void asm sideeffect "; reg use $0", "{s40}"(i32 %sgpr40) #1 + tail call void asm sideeffect "; reg use $0", "{s41}"(i32 %sgpr41) #1 + tail call void asm sideeffect "; reg use $0", "{s42}"(i32 %sgpr42) #1 + tail call void asm sideeffect "; reg use $0", "{s43}"(i32 %sgpr43) #1 + tail call void asm sideeffect "; reg use $0", "{s44}"(i32 %sgpr44) #1 + tail call void asm sideeffect "; reg use $0", "{s45}"(i32 %sgpr45) #1 + tail call void asm sideeffect "; reg use $0", "{s46}"(i32 %sgpr46) #1 + tail call void asm sideeffect "; reg use $0", "{s47}"(i32 %sgpr47) #1 + tail call void asm sideeffect "; reg use $0", "{s48}"(i32 %sgpr48) #1 + tail call void asm sideeffect "; reg use $0", "{s49}"(i32 %sgpr49) #1 + tail call void asm sideeffect "; reg use $0", "{s50}"(i32 %sgpr50) #1 + tail call void asm sideeffect "; reg use $0", "{s51}"(i32 %sgpr51) #1 + tail call void asm sideeffect "; reg use $0", "{s52}"(i32 %sgpr52) #1 + tail call void asm sideeffect "; reg use $0", "{s53}"(i32 %sgpr53) #1 + tail call void asm sideeffect "; reg use $0", "{s54}"(i32 %sgpr54) #1 + tail call void asm sideeffect "; reg use $0", "{s55}"(i32 %sgpr55) #1 + tail call void asm sideeffect "; reg use $0", "{s56}"(i32 %sgpr56) #1 + tail call void asm sideeffect "; reg use $0", "{s57}"(i32 %sgpr57) #1 + tail call void asm sideeffect "; reg use $0", "{s58}"(i32 %sgpr58) #1 + tail call void asm sideeffect "; reg use $0", "{s59}"(i32 %sgpr59) #1 + tail call void asm sideeffect "; reg use $0", "{s60}"(i32 %sgpr60) #1 + tail call void asm sideeffect "; reg use $0", "{s61}"(i32 %sgpr61) #1 + tail call void asm sideeffect "; reg use $0", "{s62}"(i32 %sgpr62) #1 + tail call void asm sideeffect "; reg use $0", "{s63}"(i32 %sgpr63) #1 + tail call void asm sideeffect "; reg use $0", "{s64}"(i32 %sgpr64) #1 + tail call void asm sideeffect "; reg use $0", "{s65}"(i32 %sgpr65) #1 + tail call void asm sideeffect "; reg use $0", "{s66}"(i32 %sgpr66) #1 + tail call void asm sideeffect "; reg use $0", "{s67}"(i32 %sgpr67) #1 + tail call void asm sideeffect "; reg use $0", "{s68}"(i32 %sgpr68) #1 + tail call void asm sideeffect "; reg use $0", "{s69}"(i32 %sgpr69) #1 + tail call void asm sideeffect "; reg use $0", 
"{s70}"(i32 %sgpr70) #1 + tail call void asm sideeffect "; reg use $0", "{s71}"(i32 %sgpr71) #1 + tail call void asm sideeffect "; reg use $0", "{s72}"(i32 %sgpr72) #1 + tail call void asm sideeffect "; reg use $0", "{s73}"(i32 %sgpr73) #1 + tail call void asm sideeffect "; reg use $0", "{s74}"(i32 %sgpr74) #1 + tail call void asm sideeffect "; reg use $0", "{s75}"(i32 %sgpr75) #1 + tail call void asm sideeffect "; reg use $0", "{s76}"(i32 %sgpr76) #1 + tail call void asm sideeffect "; reg use $0", "{s77}"(i32 %sgpr77) #1 + tail call void asm sideeffect "; reg use $0", "{s78}"(i32 %sgpr78) #1 + tail call void asm sideeffect "; reg use $0", "{s79}"(i32 %sgpr79) #1 + tail call void asm sideeffect "; reg use $0", "{s80}"(i32 %sgpr80) #1 + tail call void asm sideeffect "; reg use $0", "{s81}"(i32 %sgpr81) #1 + tail call void asm sideeffect "; reg use $0", "{s82}"(i32 %sgpr82) #1 + tail call void asm sideeffect "; reg use $0", "{s83}"(i32 %sgpr83) #1 + tail call void asm sideeffect "; reg use $0", "{s84}"(i32 %sgpr84) #1 + tail call void asm sideeffect "; reg use $0", "{s85}"(i32 %sgpr85) #1 + tail call void asm sideeffect "; reg use $0", "{s86}"(i32 %sgpr86) #1 + tail call void asm sideeffect "; reg use $0", "{s87}"(i32 %sgpr87) #1 + tail call void asm sideeffect "; reg use $0", "{s88}"(i32 %sgpr88) #1 + tail call void asm sideeffect "; reg use $0", "{s89}"(i32 %sgpr89) #1 + tail call void asm sideeffect "; reg use $0", "{s90}"(i32 %sgpr90) #1 + tail call void asm sideeffect "; reg use $0", "{s91}"(i32 %sgpr91) #1 + tail call void asm sideeffect "; reg use $0", "{s92}"(i32 %sgpr92) #1 + tail call void asm sideeffect "; reg use $0", "{s93}"(i32 %sgpr93) #1 + tail call void asm sideeffect "; reg use $0", "{s94}"(i32 %sgpr94) #1 + tail call void asm sideeffect "; reg use $0", "{s95}"(i32 %sgpr95) #1 + tail call void asm sideeffect "; reg use $0", "{s96}"(i32 %sgpr96) #1 + tail call void asm sideeffect "; reg use $0", "{s97}"(i32 %sgpr97) #1 + tail call void asm sideeffect "; reg use $0", "{s98}"(i32 %sgpr98) #1 + tail call void asm sideeffect "; reg use $0", "{s99}"(i32 %sgpr99) #1 + tail call void asm sideeffect "; reg use $0", "{vcc_lo}"(i32 %vcc_lo) #1 + tail call void asm sideeffect "; reg use $0", "{vcc_hi}"(i32 %vcc_hi) #1 + ret void + } +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare align 4 ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() #2 +attributes #1 = { nounwind } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +!0 = !{} diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll @@ -39,6 +39,7 @@ ; AFTER-PEI-NEXT: occupancy: 5 ; AFTER-PEI-NEXT: scavengeFI: '%fixed-stack.0' ; AFTER-PEI-NEXT: vgprForAGPRCopy: '' +; AFTER-PEI-NEXT: longBranchReservedReg: '' ; AFTER-PEI-NEXT: body: define amdgpu_kernel void @scavenge_fi(ptr addrspace(1) %out, i32 %in) #0 { %wide.sgpr0 = call <32 x i32> asm sideeffect "; def $0", "=s" () #0 diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll @@ -0,0 +1,120 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa 
-verify-machineinstrs -amdgpu-s-branch-bits=4 -stop-after=branch-relaxation -verify-machineinstrs %s -o - | FileCheck %s + +; Test that debug instructions do not change the long branch reserved register serialized through +; MIR. + +; CHECK-LABEL: {{^}}name: uniform_long_forward_branch_debug +; CHECK: machineFunctionInfo: +; CHECK-NEXT: explicitKernArgSize: 12 +; CHECK-NEXT: maxKernArgAlign: 8 +; CHECK-NEXT: ldsSize: 0 +; CHECK-NEXT: gdsSize: 0 +; CHECK-NEXT: dynLDSAlign: 1 +; CHECK-NEXT: isEntryFunction: true +; CHECK-NEXT: noSignedZerosFPMath: false +; CHECK-NEXT: memoryBound: false +; CHECK-NEXT: waveLimiter: false +; CHECK-NEXT: hasSpilledSGPRs: false +; CHECK-NEXT: hasSpilledVGPRs: false +; CHECK-NEXT: scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' +; CHECK-NEXT: frameOffsetReg: '$fp_reg' +; CHECK-NEXT: stackPtrOffsetReg: '$sgpr32' +; CHECK-NEXT: bytesInStackArgArea: 0 +; CHECK-NEXT: returnsVoid: true +; CHECK-NEXT: argumentInfo: +; CHECK-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; CHECK-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } +; CHECK-NEXT: workGroupIDX: { reg: '$sgpr6' } +; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr7' } +; CHECK-NEXT: workItemIDX: { reg: '$vgpr0' } +; CHECK-NEXT: psInputAddr: 0 +; CHECK-NEXT: psInputEnable: 0 +; CHECK-NEXT: mode: +; CHECK-NEXT: ieee: true +; CHECK-NEXT: dx10-clamp: true +; CHECK-NEXT: fp32-input-denormals: true +; CHECK-NEXT: fp32-output-denormals: true +; CHECK-NEXT: fp64-fp16-input-denormals: true +; CHECK-NEXT: fp64-fp16-output-denormals: true +; CHECK-NEXT: BitsOf32BitAddress: 0 +; CHECK-NEXT: occupancy: 8 +; CHECK-NEXT: vgprForAGPRCopy: '' +; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3' +; CHECK-NEXT: body: + define amdgpu_kernel void @uniform_long_forward_branch_debug(ptr addrspace(1) %arg, i32 %arg1) #0 !dbg !5 { + bb0: + %uniform_long_forward_branch_debug.kernarg.segment = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr(), !dbg !11 + %arg1.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %uniform_long_forward_branch_debug.kernarg.segment, i64 8, !dbg !11, !amdgpu.uniform !7 + %arg1.load = load i32, ptr addrspace(4) %arg1.kernarg.offset, align 8, !dbg !11, !invariant.load !7 + %tmp = icmp eq i32 %arg1.load, 0, !dbg !11 + call void @llvm.dbg.value(metadata i1 %tmp, metadata !9, metadata !DIExpression()), !dbg !11 + br i1 %tmp, label %bb3, label %Flow, !dbg !12, !amdgpu.uniform !7 + + Flow: ; preds = %bb3, %bb0 + %0 = phi i1 [ false, %bb3 ], [ true, %bb0 ], !dbg !12 + br i1 %0, label %bb2, label %bb4, !dbg !12, !amdgpu.uniform !7 + + bb2: ; preds = %Flow + store volatile i32 17, ptr addrspace(1) undef, align 4, !dbg !13 + br label %bb4, !dbg !14, !amdgpu.uniform !7 + + bb3: ; preds = %bb0 + call void asm sideeffect "v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", ""(), !dbg !15 + br label %Flow, !dbg !16, !amdgpu.uniform !7 + + bb4: ; preds = %bb2, %Flow + %arg.kernarg.offset1 = bitcast ptr addrspace(4) %uniform_long_forward_branch_debug.kernarg.segment to ptr addrspace(4), !dbg !11, !amdgpu.uniform !7 + %arg.load = load ptr addrspace(1), ptr addrspace(4) %arg.kernarg.offset1, align 16, !dbg !11, !invariant.load !7 + store volatile i32 63, ptr addrspace(1) %arg.load, align 4, !dbg !17 + ret void, !dbg !18 + } + + ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) + declare void @llvm.dbg.value(metadata, metadata, metadata) #1 + + ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn
memory(none) + declare align 4 ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() #1 + + ; Function Attrs: convergent nocallback nofree nounwind willreturn + declare { i1, i64 } @llvm.amdgcn.if.i64(i1) #2 + + ; Function Attrs: convergent nocallback nofree nounwind willreturn + declare { i1, i64 } @llvm.amdgcn.else.i64.i64(i64) #2 + + ; Function Attrs: convergent nocallback nofree nounwind willreturn memory(none) + declare i64 @llvm.amdgcn.if.break.i64(i1, i64) #3 + + ; Function Attrs: convergent nocallback nofree nounwind willreturn + declare i1 @llvm.amdgcn.loop.i64(i64) #2 + + ; Function Attrs: convergent nocallback nofree nounwind willreturn + declare void @llvm.amdgcn.end.cf.i64(i64) #2 + + attributes #0 = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } + attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } + attributes #2 = { convergent nocallback nofree nounwind willreturn } + attributes #3 = { convergent nocallback nofree nounwind willreturn memory(none) } + + !llvm.dbg.cu = !{!0} + !llvm.debugify = !{!2, !3} + !llvm.module.flags = !{!4} + + !0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) + !1 = !DIFile(filename: "temp.ll", directory: "/") + !2 = !{i32 8} + !3 = !{i32 1} + !4 = !{i32 2, !"Debug Info Version", i32 3} + !5 = distinct !DISubprogram(name: "uniform_long_forward_branch_debug", linkageName: "uniform_long_forward_branch_debug", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !8) + !6 = !DISubroutineType(types: !7) + !7 = !{} + !8 = !{!9} + !9 = !DILocalVariable(name: "1", scope: !5, file: !1, line: 1, type: !10) + !10 = !DIBasicType(name: "ty8", size: 8, encoding: DW_ATE_unsigned) + !11 = !DILocation(line: 1, column: 1, scope: !5) + !12 = !DILocation(line: 2, column: 1, scope: !5) + !13 = !DILocation(line: 3, column: 1, scope: !5) + !14 = !DILocation(line: 4, column: 1, scope: !5) + !15 = !DILocation(line: 5, column: 1, scope: !5) + !16 = !DILocation(line: 6, column: 1, scope: !5) + !17 = !DILocation(line: 7, column: 1, scope: !5) + !18 = !DILocation(line: 8, column: 1, scope: !5) diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll @@ -0,0 +1,68 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -amdgpu-s-branch-bits=4 -stop-after=branch-relaxation -verify-machineinstrs %s -o - | FileCheck %s + +; Test that long branch reserved register is serialized through +; MIR. 
+ +; CHECK-LABEL: {{^}}name: uniform_long_forward_branch +; CHECK: machineFunctionInfo: +; CHECK-NEXT: explicitKernArgSize: 12 +; CHECK-NEXT: maxKernArgAlign: 8 +; CHECK-NEXT: ldsSize: 0 +; CHECK-NEXT: gdsSize: 0 +; CHECK-NEXT: dynLDSAlign: 1 +; CHECK-NEXT: isEntryFunction: true +; CHECK-NEXT: noSignedZerosFPMath: false +; CHECK-NEXT: memoryBound: false +; CHECK-NEXT: waveLimiter: false +; CHECK-NEXT: hasSpilledSGPRs: false +; CHECK-NEXT: hasSpilledVGPRs: false +; CHECK-NEXT: scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' +; CHECK-NEXT: frameOffsetReg: '$fp_reg' +; CHECK-NEXT: stackPtrOffsetReg: '$sgpr32' +; CHECK-NEXT: bytesInStackArgArea: 0 +; CHECK-NEXT: returnsVoid: true +; CHECK-NEXT: argumentInfo: +; CHECK-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; CHECK-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } +; CHECK-NEXT: workGroupIDX: { reg: '$sgpr6' } +; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr7' } +; CHECK-NEXT: workItemIDX: { reg: '$vgpr0' } +; CHECK-NEXT: psInputAddr: 0 +; CHECK-NEXT: psInputEnable: 0 +; CHECK-NEXT: mode: +; CHECK-NEXT: ieee: true +; CHECK-NEXT: dx10-clamp: true +; CHECK-NEXT: fp32-input-denormals: true +; CHECK-NEXT: fp32-output-denormals: true +; CHECK-NEXT: fp64-fp16-input-denormals: true +; CHECK-NEXT: fp64-fp16-output-denormals: true +; CHECK-NEXT: BitsOf32BitAddress: 0 +; CHECK-NEXT: occupancy: 8 +; CHECK-NEXT: vgprForAGPRCopy: '' +; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3' +; CHECK-NEXT: body: +define amdgpu_kernel void @uniform_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) { +bb0: + %tmp = icmp ne i32 %arg1, 0 + br i1 %tmp, label %bb2, label %bb3 + +bb2: + store volatile i32 17, ptr addrspace(1) undef + br label %bb4 + +bb3: + ; 32 byte asm + call void asm sideeffect + "v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64", ""() #0 + br label %bb4 + +bb4: + store volatile i32 63, ptr addrspace(1) %arg + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir @@ -48,6 +48,7 @@ # FULL-NEXT: highBitsOf32BitAddress: 0 # FULL-NEXT: occupancy: 8 # FULL-NEXT: vgprForAGPRCopy: '' +# FULL-NEXT: longBranchReservedReg: '' # FULL-NEXT: body: # SIMPLE: machineFunctionInfo: @@ -148,6 +149,7 @@ # FULL-NEXT: highBitsOf32BitAddress: 0 # FULL-NEXT: occupancy: 8 # FULL-NEXT: vgprForAGPRCopy: '' +# FULL-NEXT: longBranchReservedReg: '' # FULL-NEXT: body: # SIMPLE: machineFunctionInfo: @@ -219,6 +221,7 @@ # FULL-NEXT: highBitsOf32BitAddress: 0 # FULL-NEXT: occupancy: 8 # FULL-NEXT: vgprForAGPRCopy: '' +# FULL-NEXT: longBranchReservedReg: '' # FULL-NEXT: body: # SIMPLE: machineFunctionInfo: @@ -291,6 +294,7 @@ # FULL-NEXT: highBitsOf32BitAddress: 0 # FULL-NEXT: occupancy: 8 # FULL-NEXT: vgprForAGPRCopy: '' +# FULL-NEXT: longBranchReservedReg: '' # FULL-NEXT: body: # SIMPLE: machineFunctionInfo: diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll @@ -42,6 +42,7 @@ ; CHECK-NEXT: highBitsOf32BitAddress: 0 ; CHECK-NEXT: occupancy: 8 ; CHECK-NEXT: vgprForAGPRCopy: '' +; CHECK-NEXT: longBranchReservedReg: '' ; CHECK-NEXT: body: define amdgpu_kernel void 
@kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) { %gep = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %arg0 @@ -84,6 +85,7 @@ ; CHECK-NEXT: highBitsOf32BitAddress: 0 ; CHECK-NEXT: occupancy: 10 ; CHECK-NEXT: vgprForAGPRCopy: '' +; CHECK-NEXT: longBranchReservedReg: '' ; CHECK-NEXT: body: define amdgpu_ps void @ps_shader(i32 %arg0, i32 inreg %arg1) { %gep = getelementptr inbounds [128 x i32], ptr addrspace(2) @gds, i32 0, i32 %arg0 @@ -150,6 +152,7 @@ ; CHECK-NEXT: highBitsOf32BitAddress: 0 ; CHECK-NEXT: occupancy: 8 ; CHECK-NEXT: vgprForAGPRCopy: '' +; CHECK-NEXT: longBranchReservedReg: '' ; CHECK-NEXT: body: define void @function() { ret void @@ -198,6 +201,7 @@ ; CHECK-NEXT: highBitsOf32BitAddress: 0 ; CHECK-NEXT: occupancy: 8 ; CHECK-NEXT: vgprForAGPRCopy: '' +; CHECK-NEXT: longBranchReservedReg: '' ; CHECK-NEXT: body: define void @function_nsz() #0 { ret void