Index: lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- lib/Target/AMDGPU/AMDGPU.h
+++ lib/Target/AMDGPU/AMDGPU.h
@@ -50,6 +50,8 @@
 FunctionPass *createSIOptimizeExecMaskingPreRAPass();
 FunctionPass *createSIFixSGPRCopiesPass();
 FunctionPass *createSIMemoryLegalizerPass();
+FunctionPass *createSIInsertScratchBoundsPass();
+FunctionPass *createSIFixScratchSizePass();
 FunctionPass *createSIInsertWaitcntsPass();
 FunctionPass *createSIPreAllocateWWMRegsPass();
 FunctionPass *createSIFormMemoryClausesPass();
@@ -142,6 +144,13 @@
 void initializeSILowerControlFlowPass(PassRegistry &);
 extern char &SILowerControlFlowID;
 
+void initializeSIInsertScratchBoundsPass(PassRegistry &);
+extern char &SIInsertScratchBoundsID;
+
+void initializeSIFixScratchSizePass(PassRegistry &);
+extern char &SIFixScratchSizeID;
+extern const char *const SIScratchSizeSymbol;
+
 void initializeSIInsertSkipsPass(PassRegistry &);
 extern char &SIInsertSkipsPassID;
Index: lib/Target/AMDGPU/AMDGPU.td
===================================================================
--- lib/Target/AMDGPU/AMDGPU.td
+++ lib/Target/AMDGPU/AMDGPU.td
@@ -489,6 +489,13 @@
   "Force using DS instruction immediate offsets on SI"
 >;
 
+def FeatureEnableScratchBoundsChecks : SubtargetFeature<
+  "enable-scratch-bounds-checks",
+  "EnableScratchBoundsChecks",
+  "true",
+  "Enable insertion of bounds checks on scratch accesses"
+>;
+
 def FeatureEnableSIScheduler : SubtargetFeature<"si-scheduler",
   "EnableSIScheduler", "true",
Index: lib/Target/AMDGPU/AMDGPUSubtarget.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -301,6 +301,7 @@
   // Used as options.
   bool EnableLoadStoreOpt;
   bool EnableUnsafeDSOffsetFolding;
+  bool EnableScratchBoundsChecks;
   bool EnableSIScheduler;
   bool EnableDS128;
   bool EnablePRTStrictNull;
@@ -846,6 +847,10 @@
 
   bool hasMadF16() const;
 
+  bool enableScratchBoundsChecks() const {
+    return EnableScratchBoundsChecks;
+  }
+
   bool enableSIScheduler() const {
     return EnableSIScheduler;
   }
Index: lib/Target/AMDGPU/AMDGPUSubtarget.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -192,6 +192,7 @@
 
     EnableLoadStoreOpt(false),
     EnableUnsafeDSOffsetFolding(false),
+    EnableScratchBoundsChecks(false),
     EnableSIScheduler(false),
     EnableDS128(false),
     EnablePRTStrictNull(false),
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -896,6 +896,7 @@
   if (LateCFGStructurize) {
     addPass(createAMDGPUMachineCFGStructurizerPass());
   }
+  addPass(createSIInsertScratchBoundsPass());
   addPass(createSIWholeQuadModePass());
 }
 
@@ -959,6 +960,7 @@
   addPass(createSIInsertWaitcntsPass());
   addPass(createSIShrinkInstructionsPass());
   addPass(createSIModeRegisterPass());
+  addPass(createSIFixScratchSizePass());
 
   // The hazard recognizer that runs as part of the post-ra scheduler does not
   // guarantee to be able handle all hazards correctly. This is because if there
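
Note on usage and pipeline placement: the checks are off by default (EnableScratchBoundsChecks is initialized to false) and are gated on the new enable-scratch-bounds-checks subtarget feature, so SIInsertScratchBounds is a no-op unless the feature is requested. A minimal invocation, mirroring the RUN line of the new test below (the input and output file names here are placeholders), would be:

  llc -march=amdgcn -mcpu=gfx900 -mattr=+enable-scratch-bounds-checks bounds.ll -o bounds.s

As the hunks above show, SIInsertScratchBounds is added just before SIWholeQuadMode, while the function is still in machine SSA form (the pass creates virtual registers and PHIs), and SIFixScratchSize is added to the late pre-emit passes after SIModeRegister, once the final stack size is known.
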
Index: lib/Target/AMDGPU/CMakeLists.txt
===================================================================
--- lib/Target/AMDGPU/CMakeLists.txt
+++ lib/Target/AMDGPU/CMakeLists.txt
@@ -92,6 +92,7 @@
   R600RegisterInfo.cpp
   SIAddIMGInit.cpp
   SIAnnotateControlFlow.cpp
+  SIFixScratchSize.cpp
   SIFixSGPRCopies.cpp
   SIFixupVectorISel.cpp
   SIFixVGPRCopies.cpp
@@ -99,6 +100,7 @@
   SIFoldOperands.cpp
   SIFormMemoryClauses.cpp
   SIFrameLowering.cpp
+  SIInsertScratchBounds.cpp
   SIInsertSkips.cpp
   SIInsertWaitcnts.cpp
   SIInstrInfo.cpp
Index: lib/Target/AMDGPU/SIFixScratchSize.cpp
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/SIFixScratchSize.cpp
@@ -0,0 +1,82 @@
+//===- SIFixScratchSize.cpp - resolve scratch size symbols -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass replaces references to the scratch size symbol with the
+/// actual scratch size. This pass should be run late, i.e. when the scratch
+/// size for a given machine function is known.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+
+#include <cstring>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-fix-scratch-size"
+
+namespace {
+
+class SIFixScratchSize : public MachineFunctionPass {
+public:
+  static char ID;
+
+  SIFixScratchSize() : MachineFunctionPass(ID) {}
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesAll();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+};
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(SIFixScratchSize, DEBUG_TYPE,
+                "SI Resolve Scratch Size Symbols",
+                false, false)
+
+char SIFixScratchSize::ID = 0;
+
+char &llvm::SIFixScratchSizeID = SIFixScratchSize::ID;
+
+const char *const llvm::SIScratchSizeSymbol = "___SCRATCH_SIZE";
+
+FunctionPass *llvm::createSIFixScratchSizePass() {
+  return new SIFixScratchSize;
+}
+
+bool SIFixScratchSize::runOnMachineFunction(MachineFunction &MF) {
+  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+  const uint64_t StackSize = FrameInfo.getStackSize();
+
+  bool Changed = false;
+
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
+        MachineOperand &Src = MI.getOperand(1);
+        if (Src.isSymbol()) {
+          if (strcmp(Src.getSymbolName(), SIScratchSizeSymbol) == 0) {
+            LLVM_DEBUG(dbgs() << "Fixing: " << MI << "\n");
+            Src.ChangeToImmediate(StackSize);
+            Changed = true;
+          }
+        }
+      }
+    }
+  }
+
+  return Changed;
+}
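
To make the handshake between the two passes concrete: SIInsertScratchBounds (next file) materializes the bound as an S_MOV_B32 whose source operand is the external symbol ___SCRATCH_SIZE, and SIFixScratchSize rewrites that operand in place once MachineFrameInfo::getStackSize() is final. A rough before/after sketch; the SGPR number and the final size value are illustrative, not taken from the patch:

  ; emitted by SIInsertScratchBounds, size not yet known
  s_mov_b32 s4, ___SCRATCH_SIZE
  ; after SIFixScratchSize, assuming a final stack size of 0x8004 bytes
  s_mov_b32 s4, 0x8004
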
Index: lib/Target/AMDGPU/SIInsertScratchBounds.cpp
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/SIInsertScratchBounds.cpp
@@ -0,0 +1,354 @@
+//===- SIInsertScratchBounds.cpp - insert scratch bounds checks -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass inserts bounds checks on scratch accesses.
+/// Out-of-bounds reads return zero, and out-of-bounds writes have no effect.
+/// This is intended to be used on GFX9 where bounds checking is no longer
+/// performed by hardware and hence page faults can result from out-of-bounds
+/// accesses by shaders.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+
+#include <cassert>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-insert-scratch-bounds"
+
+namespace {
+
+class SIInsertScratchBounds : public MachineFunctionPass {
+private:
+  const GCNSubtarget *ST;
+  const SIInstrInfo *TII;
+  MachineRegisterInfo *MRI;
+  const SIRegisterInfo *RI;
+
+public:
+  static char ID;
+
+  SIInsertScratchBounds() : MachineFunctionPass(ID) {}
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool insertBoundsCheck(MachineFunction &MF, MachineInstr *MI,
+                         const int64_t SizeEstimate,
+                         const unsigned SizeReg,
+                         MachineBasicBlock **NextBB);
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+};
+
+static void zeroReg(MachineBasicBlock &MBB, MachineRegisterInfo *MRI,
+                    const SIRegisterInfo *RI, const SIInstrInfo *TII,
+                    MachineBasicBlock::iterator &I, const DebugLoc &DL,
+                    unsigned Reg) {
+
+  auto EndDstRC = MRI->getRegClass(Reg);
+  uint32_t RegSize = RI->getRegSizeInBits(*EndDstRC) / 32;
+
+  assert(RI->isVGPR(*MRI, Reg) && "can only zero VGPRs");
+
+  if (RegSize == 1)
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), Reg).addImm(0);
+  else {
+    SmallVector<unsigned, 8> TRegs;
+    for (unsigned i = 0; i < RegSize; ++i) {
+      unsigned TReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), TReg).addImm(0);
+      TRegs.push_back(TReg);
+    }
+    MachineInstrBuilder MIB =
+        BuildMI(MBB, I, DL, TII->get(AMDGPU::REG_SEQUENCE), Reg);
+    for (unsigned i = 0; i < RegSize; ++i) {
+      MIB.addReg(TRegs[i]);
+      MIB.addImm(RI->getSubRegFromChannel(i));
+    }
+  }
+}
+
+static void cndmask0Reg(MachineBasicBlock &MBB, MachineRegisterInfo *MRI,
+                        const SIRegisterInfo *RI, const SIInstrInfo *TII,
+                        MachineBasicBlock::iterator &I, const DebugLoc &DL,
+                        unsigned SrcReg, unsigned MaskReg, bool KillMask,
+                        unsigned DstReg) {
+
+  auto EndDstRC = MRI->getRegClass(DstReg);
+  uint32_t RegSize = RI->getRegSizeInBits(*EndDstRC) / 32;
+
+  assert(RI->isVGPR(*MRI, DstReg) && "can only cndmask VGPRs");
+
+  if (RegSize == 1)
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+        .addImm(0)
+        .addImm(0)
+        .addImm(0)
+        .addReg(SrcReg)
+        .addReg(MaskReg, getKillRegState(KillMask));
+  else {
+    SmallVector<unsigned, 8> TRegs;
+    for (unsigned i = 0; i < RegSize; ++i) {
+      unsigned TReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), TReg)
+          .addImm(0)
+          .addImm(0)
+          .addImm(0)
+          .addReg(SrcReg, 0, AMDGPU::sub0 + i)
+          .addReg(MaskReg, getKillRegState(KillMask && (i == (RegSize - 1))));
+      TRegs.push_back(TReg);
+    }
+    MachineInstrBuilder MIB =
+        BuildMI(MBB, I, DL, TII->get(AMDGPU::REG_SEQUENCE), DstReg);
+    for (unsigned i = 0; i < RegSize; ++i) {
+      MIB.addReg(TRegs[i]);
+      MIB.addImm(RI->getSubRegFromChannel(i));
+    }
+  }
+}
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(SIInsertScratchBounds, DEBUG_TYPE,
+                "SI Insert Scratch Bounds Checks",
+                false, false)
+
+char SIInsertScratchBounds::ID = 0;
+
+char &llvm::SIInsertScratchBoundsID = SIInsertScratchBounds::ID;
+
+FunctionPass *llvm::createSIInsertScratchBoundsPass() {
+  return new SIInsertScratchBounds;
+}
+
+bool SIInsertScratchBounds::insertBoundsCheck(MachineFunction &MF,
+                                              MachineInstr *MI,
+                                              const int64_t SizeEstimate,
+                                              const unsigned SizeReg,
+                                              MachineBasicBlock **NextBB) {
+  const bool IsLoad = MI->mayLoad();
+  DebugLoc DL = MI->getDebugLoc();
+
+  const MachineOperand *Offset =
+      TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
+  const MachineOperand *VAddr =
+      TII->getNamedOperand(*MI, AMDGPU::OpName::vaddr);
+  const MachineOperand *Addr =
+      VAddr ? VAddr : TII->getNamedOperand(*MI, AMDGPU::OpName::saddr);
+
+  if (!Addr || !Addr->isReg()) {
+    // Constant offset -> determine bounds check statically
+    if (Offset->getImm() < SizeEstimate) {
+      // Statically in bounds
+      return false;
+    }
+    // Else: estimate may be revised upward so we cannot statically delete
+  }
+
+  // Setup new block structure
+  MachineBasicBlock *PreAccessBB = MI->getParent();
+  MachineBasicBlock *ScratchAccessBB = MF.CreateMachineBasicBlock();
+  MachineBasicBlock *PostAccessBB = MF.CreateMachineBasicBlock();
+  *NextBB = PostAccessBB;
+
+  MachineFunction::iterator MBBI(*PreAccessBB);
+  ++MBBI;
+
+  MF.insert(MBBI, ScratchAccessBB);
+  MF.insert(MBBI, PostAccessBB);
+
+  ScratchAccessBB->addSuccessor(PostAccessBB);
+
+  // Move instructions following scratch access to new basic block
+  MachineBasicBlock::iterator SuccI(*MI);
+  ++SuccI;
+  PostAccessBB->transferSuccessorsAndUpdatePHIs(PreAccessBB);
+  PostAccessBB->splice(
+      PostAccessBB->begin(), PreAccessBB, SuccI, PreAccessBB->end()
+  );
+
+  PreAccessBB->addSuccessor(ScratchAccessBB);
+
+  // Move scratch access to its own basic block
+  MI->removeFromParent();
+  ScratchAccessBB->insertAfter(ScratchAccessBB->begin(), MI);
+
+  MachineBasicBlock::iterator PreI = PreAccessBB->end();
+  MachineBasicBlock::iterator PostI = PostAccessBB->begin();
+  MachineBasicBlock::iterator ScratchI = ScratchAccessBB->end();
+  unsigned AddrReg;
+  bool KillAddr = false;
+
+  if (Offset && (Offset->getImm() > 0)) {
+    AddrReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    KillAddr = true;
+
+    if (Addr && Addr->isReg()) {
+      TII->getAddNoCarry(*PreAccessBB, PreI, DL, AddrReg)
+          .addImm(Offset->getImm())
+          .addReg(Addr->getReg())
+          .addImm(0); // clamp bit
+    } else {
+      BuildMI(*PreAccessBB, PreI, DL,
+              TII->get(AMDGPU::V_MOV_B32_e32), AddrReg)
+          .addImm(Offset->getImm());
+    }
+  } else {
+    assert(Addr);
+    AddrReg = Addr->getReg();
+  }
+
+  if (RI->isVGPR(*MRI, AddrReg)) {
+    const unsigned CondReg
+        = MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+    const unsigned ExecReg
+        = MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+
+    BuildMI(*PreAccessBB, PreI, DL,
+            TII->get(AMDGPU::V_CMP_LT_U32_e64), CondReg)
+        .addReg(AddrReg, getKillRegState(KillAddr))
+        .addReg(SizeReg);
+    BuildMI(*PreAccessBB, PreI, DL,
+            TII->get(AMDGPU::S_AND_SAVEEXEC_B64), ExecReg)
+        .addReg(CondReg, getKillRegState(!IsLoad));
+    BuildMI(*ScratchAccessBB, ScratchI, DL,
+            TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+        .addReg(ExecReg, RegState::Kill);
+
+    if (IsLoad) {
+      MachineOperand &Dst = MI->getOperand(0);
+      const unsigned DstReg = Dst.getReg();
+      const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg);
+      const unsigned LoadDstReg =
+          MRI->createVirtualRegister(DstRC);
+
+      Dst.setReg(LoadDstReg);
+
+      cndmask0Reg(*PostAccessBB, MRI, RI, TII, PostI, DL,
+                  LoadDstReg, CondReg, true, DstReg);
+    }
+  } else {
+    if (MI->mayLoad()) {
+      // Load -> scalar comparison, then load, else load zero
+      MachineBasicBlock *OutOfBoundsBB = MF.CreateMachineBasicBlock();
+      MachineBasicBlock::iterator OOBI = OutOfBoundsBB->end();
+
+      MBBI--;
+      MF.insert(MBBI, OutOfBoundsBB);
+      OutOfBoundsBB->addSuccessor(PostAccessBB);
+      PreAccessBB->addSuccessor(OutOfBoundsBB);
+
+      // TODO: mark SCC as clobbered?
+      BuildMI(*PreAccessBB, PreI, DL, TII->get(AMDGPU::S_CMP_LT_U32))
+          .addReg(AddrReg, getKillRegState(KillAddr))
+          .addReg(SizeReg);
+      BuildMI(*PreAccessBB, PreI, DL, TII->get(AMDGPU::S_CBRANCH_SCC0))
+          .addMBB(OutOfBoundsBB);
+
+      BuildMI(*ScratchAccessBB, ScratchI, DL, TII->get(AMDGPU::S_BRANCH))
+          .addMBB(PostAccessBB);
+
+      MachineOperand &Dst = MI->getOperand(0);
+      const unsigned DstReg = Dst.getReg();
+
+      const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg);
+      const unsigned LoadDstReg = MRI->createVirtualRegister(DstRC);
+      const unsigned ZeroDstReg = MRI->createVirtualRegister(DstRC);
+
+      zeroReg(*OutOfBoundsBB, MRI, RI, TII, OOBI, DL, ZeroDstReg);
+
+      BuildMI(*PostAccessBB, PostI, DL, TII->get(TargetOpcode::PHI), DstReg)
+          .addReg(LoadDstReg)
+          .addMBB(ScratchAccessBB)
+          .addReg(ZeroDstReg)
+          .addMBB(OutOfBoundsBB);
+
+      Dst.setReg(LoadDstReg);
+    } else {
+      // Store -> scalar comparison and skip store
+      // TODO: mark SCC as clobbered?
+      BuildMI(*PreAccessBB, PreI, DL, TII->get(AMDGPU::S_CMP_LT_U32))
+          .addReg(AddrReg, getKillRegState(KillAddr))
+          .addReg(SizeReg);
+      BuildMI(*PreAccessBB, PreI, DL, TII->get(AMDGPU::S_CBRANCH_SCC0))
+          .addMBB(PostAccessBB);
+      PreAccessBB->addSuccessor(PostAccessBB);
+    }
+  }
+
+  return true;
+}
+
+bool SIInsertScratchBounds::runOnMachineFunction(MachineFunction &MF) {
+  ST = &MF.getSubtarget<GCNSubtarget>();
+  TII = ST->getInstrInfo();
+  MRI = &MF.getRegInfo();
+  RI = ST->getRegisterInfo();
+
+  if (!ST->enableScratchBoundsChecks())
+    return false;
+
+  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+  const int64_t ScratchSizeEstimate =
+      (int64_t) FrameInfo.estimateStackSize(MF);
+
+  bool Changed = false;
+  unsigned SizeReg = 0; // defer assigning a register until required
+
+  MachineFunction::iterator NextBB;
+  for (MachineFunction::iterator BI = MF.begin();
+       BI != MF.end(); BI = NextBB) {
+    NextBB = std::next(BI);
+    MachineBasicBlock &MBB = *BI;
+    MachineBasicBlock *NewNextBB = nullptr;
+
+    for (MachineInstr &MI : MBB) {
+      if (MI.mayLoad() || MI.mayStore()) {
+        for (const auto &MMO : MI.memoperands()) {
+          const unsigned AddrSpace = MMO->getPointerInfo().getAddrSpace();
+          if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
+            // uses scratch; needs to be processed
+            if (!SizeReg)
+              SizeReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+            Changed |= insertBoundsCheck(
+                MF, &MI, ScratchSizeEstimate, SizeReg,
+                &NewNextBB
+            );
+            break;
+          }
+        }
+      }
+      if (NewNextBB) {
+        // Restart at the newly created next BB
+        NextBB = MachineFunction::iterator(*NewNextBB);
+        break;
+      }
+    }
+  }
+
+  // If scratch size is required then add to prelude
+  if (Changed) {
+    MachineBasicBlock *PreludeBB = &MF.front();
+    MachineBasicBlock::iterator PreludeI = PreludeBB->begin();
+    DebugLoc UnknownDL;
+
+    BuildMI(*PreludeBB, PreludeI, UnknownDL,
+            TII->get(AMDGPU::S_MOV_B32), SizeReg)
+        .addExternalSymbol(SIScratchSizeSymbol);
+  }
+
+  return Changed;
+}
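
For divergent (VGPR) scratch addresses, insertBoundsCheck above guards the access with the exec mask rather than a branch. The intended shape of the final code, which the new test below checks, is roughly the following; the register numbers are illustrative:

  s_mov_b32 s4, 0x8004                    ; scratch size, patched by SIFixScratchSize
  v_cmp_gt_u32_e64 s[0:1], s4, v0         ; lanes with address < size are in bounds
  s_and_saveexec_b64 s[2:3], s[0:1]       ; disable out-of-bounds lanes
  buffer_load_dword v1, v0, s[8:11], s12 offen
  s_mov_b64 exec, s[2:3]                  ; restore the original exec mask
  v_cndmask_b32_e64 v2, 0, v1, s[0:1]     ; out-of-bounds lanes read zero

For a uniform (SGPR) address the pass instead emits S_CMP_LT_U32 followed by S_CBRANCH_SCC0 around the access; for loads, a zeroed register from the out-of-bounds block is merged back with a PHI.
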
Index: test/CodeGen/AMDGPU/scratch-bounds.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/scratch-bounds.ll
@@ -0,0 +1,226 @@
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=gfx900 -mattr=+max-private-element-size-16,+enable-scratch-bounds-checks < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}bounds_check_load_i32:
+; GCN: s_mov_b32 [[BOUNDS:s[0-9]+]], 0x8004
+; GCN: v_cmp_gt_u32_e64 [[BOUNDSMASK:s\[[0-9]+:[0-9]+\]]], [[BOUNDS]], [[OFFSET:v[0-9]+]]
+; GCN: s_and_saveexec_b64 [[EXECMASK:s\[[0-9]+:[0-9]+\]]], [[BOUNDSMASK]]
+; GCN: buffer_load_dword [[LOADVALUE:v[0-9]+]], [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}}
+; GCN: s_mov_b64 exec, [[EXECMASK]]
+; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, [[LOADVALUE]], [[BOUNDSMASK]]
+
+define amdgpu_kernel void @bounds_check_load_i32(i32 addrspace(1)* %out, i32 %offset) {
+entry:
+  %scratch = alloca [8192 x i32], addrspace(5)
+
+  %ptr = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %scratch, i32 0, i32 %offset
+  %value = load i32, i32 addrspace(5)* %ptr
+  store i32 %value, i32 addrspace(1)* %out
+
+  ret void
+}
+
+; GCN-LABEL: {{^}}bounds_check_store_i32:
+; GCN: s_mov_b32 [[BOUNDS:s[0-9]+]], 0x8004
+; GCN: v_cmp_gt_u32_e64 [[BOUNDSMASK:s\[[0-9]+:[0-9]+\]]], [[BOUNDS]], [[OFFSET:v[0-9]+]]
+; GCN: s_and_saveexec_b64 [[EXECMASK:s\[[0-9]+:[0-9]+\]]], [[BOUNDSMASK]]
+; GCN: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}}
+; GCN: s_mov_b64 exec, [[EXECMASK]]
+
+define amdgpu_kernel void @bounds_check_store_i32(i32 addrspace(1)* %out, i32 %value, i32 %offset) {
+entry:
+  %scratch = alloca [8192 x i32], addrspace(5)
+
+  %ptr = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %scratch, i32 0, i32 %offset
+  store i32 %value, i32 addrspace(5)* %ptr
+  store i32 %value, i32 addrspace(1)* %out
+
+  ret void
+}
+
+; GCN-LABEL: {{^}}bounds_check_load_i64:
+; GCN: s_mov_b32 [[BOUNDS:s[0-9]+]], 0x8008
+; GCN: s_and_saveexec_b64 [[EXECMASK:s\[[0-9]+:[0-9]+\]]], [[BOUNDSMASK:s\[[0-9]+:[0-9]+\]]]
+; GCN: buffer_load_dwordx2 v{{\[}}[[LOADLO:[0-9]+]]:[[LOADHI:[0-9]+]]{{\]}}, [[OFFSET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}}
+; GCN: s_mov_b64 exec, [[EXECMASK]]
+; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v[[LOADLO]], [[BOUNDSMASK]]
+; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v[[LOADHI]], [[BOUNDSMASK]]
+
+define amdgpu_kernel void @bounds_check_load_i64(i64 addrspace(1)* %out, i32 %offset) {
+entry:
+  %scratch = alloca [4096 x i64], addrspace(5)
+
+  %ptr = getelementptr [4096 x i64], [4096 x i64] addrspace(5)* %scratch, i32 0, i32 %offset
+  %value = load i64, i64 addrspace(5)* %ptr
+  store i64 %value, i64 addrspace(1)* %out
+
+  ret void
+}
+
+; GCN-LABEL: {{^}}bounds_check_store_i64:
+; GCN: s_mov_b32 [[BOUNDS:s[0-9]+]], 0x8008
+; GCN: s_and_saveexec_b64 [[EXECMASK:s\[[0-9]+:[0-9]+\]]], [[BOUNDSMASK:s\[[0-9]+:[0-9]+\]]]
+; GCN: buffer_store_dwordx2 v[{{[0-9]+}}:{{[0-9]+}}], [[OFFSET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}}
+; GCN: s_mov_b64 exec, [[EXECMASK]]
+
+define amdgpu_kernel void @bounds_check_store_i64(i64 addrspace(1)* %out, i64 %value, i32 %offset) {
+entry:
+  %scratch = alloca [4096 x i64], addrspace(5)
+
+  %ptr = getelementptr [4096 x i64], [4096 x i64] addrspace(5)* %scratch, i32 0, i32 %offset
+  store i64 %value, i64 addrspace(5)* %ptr
+  store i64 %value, i64 addrspace(1)* %out
+
+  ret void
+}
+
+; GCN-LABEL: {{^}}bounds_check_load_i128:
+; GCN: s_mov_b32 [[BOUNDS:s[0-9]+]], 0x8008
+; GCN: s_and_saveexec_b64 [[EXECMASK:s\[[0-9]+:[0-9]+\]]], [[BOUNDSMASK:s\[[0-9]+:[0-9]+\]]]
+; GCN: buffer_load_dwordx4 v{{\[}}[[LOADLO:[0-9]+]]:[[LOADHI:[0-9]+]]{{\]}}, [[OFFSET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}}
+; GCN: s_mov_b64 exec, [[EXECMASK]]
+; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v[[LOADLO]], [[BOUNDSMASK]]
+; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v[[LOADHI]], [[BOUNDSMASK]]
+
+define amdgpu_kernel void @bounds_check_load_i128(i128 addrspace(1)* %out, i32 %offset) {
+entry:
+  %scratch = alloca [2048 x i128], addrspace(5)
+
+  %ptr = getelementptr [2048 x i128], [2048 x i128] addrspace(5)* %scratch, i32 0, i32 %offset
+  %value = load i128, i128 addrspace(5)* %ptr
+  store i128 %value, i128 addrspace(1)* %out
+
+  ret void
+}
+
+; GCN-LABEL: {{^}}bounds_check_store_i128:
+; GCN: s_mov_b32 [[BOUNDS:s[0-9]+]], 0x8008
+; GCN: s_and_saveexec_b64 [[EXECMASK:s\[[0-9]+:[0-9]+\]]], [[BOUNDSMASK:s\[[0-9]+:[0-9]+\]]]
+; GCN: buffer_store_dwordx4 v[{{[0-9]+}}:{{[0-9]+}}], [[OFFSET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}}
+; GCN: s_mov_b64 exec, [[EXECMASK]]
+
+define amdgpu_kernel void @bounds_check_store_i128(i128 addrspace(1)* %out, i128 %value, i32 %offset) {
+entry:
+  %scratch = alloca [2048 x i128], addrspace(5)
+
+  %ptr = getelementptr [2048 x i128], [2048 x i128] addrspace(5)* %scratch, i32 0, i32 %offset
+  store i128 %value, i128 addrspace(5)* %ptr
+  store i128 %value, i128 addrspace(1)* %out
+
+  ret void
+}
+
+; GCN-LABEL: {{^}}bounds_check_static_valid_store_i32:
+; GCN-NOT: s_and_saveexec_b64
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offset:20
+
+define amdgpu_kernel void @bounds_check_static_valid_store_i32(i32 addrspace(1)* %out, i32 %value, i32 %offset) {
+entry:
+  %scratch = alloca [256 x i32], addrspace(5)
+
+  %ptr = getelementptr [256 x i32], [256 x i32] addrspace(5)* %scratch, i32 0, i32 4
+  store i32 %value, i32 addrspace(5)* %ptr
+
+  %load_ptr = getelementptr [256 x i32], [256 x i32] addrspace(5)* %scratch, i32 0, i32 %offset
+  %val = load i32, i32 addrspace(5)* %load_ptr
+  store i32 %val, i32 addrspace(1)* %out
+
+  ret void
+}
+
+; GCN-LABEL: {{^}}bounds_check_static_oob_store_i32:
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offset:2052
+
+define amdgpu_kernel void @bounds_check_static_oob_store_i32(i32 addrspace(1)* %out, i32 %value, i32 %offset) {
+entry:
+  %scratch = alloca [256 x i32], addrspace(5)
+
+  %ptr = getelementptr [256 x i32], [256 x i32] addrspace(5)* %scratch, i32 0, i32 512
+  store i32 %value, i32 addrspace(5)* %ptr
+
+  %load_ptr = getelementptr [256 x i32], [256 x i32] addrspace(5)* %scratch, i32 0, i32 %offset
+  %val = load i32, i32 addrspace(5)* %load_ptr
+  store i32 %val, i32 addrspace(1)* %out
+
+  ret void
+}
+
+; GCN-LABEL: {{^}}bounds_check_static_valid_load_i32:
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}}
+; GCN-NOT: s_and_saveexec_b64
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offset:20
+; GCN-NOT: v_cndmask_b32_e64
+
+define amdgpu_kernel void @bounds_check_static_valid_load_i32(i32 addrspace(1)* %out, i32 %value, i32 %offset) {
+entry:
+  %scratch = alloca [256 x i32], addrspace(5)
+
+  %store_ptr = getelementptr [256 x i32], [256 x i32] addrspace(5)* %scratch, i32 0, i32 %offset
+  store i32 %value, i32 addrspace(5)* %store_ptr
+
+  %ptr = getelementptr [256 x i32], [256 x i32] addrspace(5)* %scratch, i32 0, i32 4
+  %val = load i32, i32 addrspace(5)* %ptr
+
+  store i32 %val, i32 addrspace(1)* %out
+
+  ret void
+}
+
+; GCN-LABEL: {{^}}bounds_check_static_oob_load_i32:
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}}
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offset:2052
+
+define amdgpu_kernel void @bounds_check_static_oob_load_i32(i32 addrspace(1)* %out, i32 %value, i32 %offset) {
+entry:
+  %scratch = alloca [256 x i32], addrspace(5)
+
+  %store_ptr = getelementptr [256 x i32], [256 x i32] addrspace(5)* %scratch, i32 0, i32 %offset
+  store i32 %value, i32 addrspace(5)* %store_ptr
+
+  %ptr = getelementptr [256 x i32], [256 x i32] addrspace(5)* %scratch, i32 0, i32 512
+  %val = load i32, i32 addrspace(5)* %ptr
+
+  store i32 %val, i32 addrspace(1)* %out
+
+  ret void
+}
+
+; GCN-LABEL: {{^}}bounds_check_load_offset_i32:
+; GCN: s_mov_b32 [[BOUNDS:s[0-9]+]], 0x8004
+; GCN: v_add_u32_e32 [[CMPOFFSET:v[0-9]+]], 16, [[OFFSET:v[0-9]+]]
+; GCN: v_cmp_gt_u32_e64 [[BOUNDSMASK:s\[[0-9]+:[0-9]+\]]], [[BOUNDS]], [[CMPOFFSET]]
+; GCN: s_and_saveexec_b64 [[EXECMASK:s\[[0-9]+:[0-9]+\]]], [[BOUNDSMASK]]
+; GCN: buffer_load_dword [[LOADVALUE:v[0-9]+]], [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen offset:16{{$}}
+; GCN: s_mov_b64 exec, [[EXECMASK]]
+; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, [[LOADVALUE]], [[BOUNDSMASK]]
+
+define amdgpu_kernel void @bounds_check_load_offset_i32(i32 addrspace(1)* %out, i32 %offset) {
+entry:
+  %scratch = alloca [8192 x i32], addrspace(5)
+
+  %ptr.0 = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %scratch, i32 0, i32 %offset
+  %ptr.1 = getelementptr i32, i32 addrspace(5)* %ptr.0, i32 4
+  %value = load i32, i32 addrspace(5)* %ptr.1
+  store i32 %value, i32 addrspace(1)* %out
+
+  ret void
+}
+
+; GCN-LABEL: {{^}}bounds_check_store_offset_i32:
+; GCN: s_mov_b32 [[BOUNDS:s[0-9]+]], 0x8004
+; GCN: v_add_u32_e32 [[CMPOFFSET:v[0-9]+]], 16, [[OFFSET:v[0-9]+]]
+; GCN: v_cmp_gt_u32_e64 [[BOUNDSMASK:s\[[0-9]+:[0-9]+\]]], [[BOUNDS]], [[CMPOFFSET]]
+; GCN: s_and_saveexec_b64 [[EXECMASK:s\[[0-9]+:[0-9]+\]]], [[BOUNDSMASK]]
+; GCN: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen offset:16{{$}}
+; GCN: s_mov_b64 exec, [[EXECMASK]]
+
+define amdgpu_kernel void @bounds_check_store_offset_i32(i32 addrspace(1)* %out, i32 %value, i32 %offset) {
+entry:
+  %scratch = alloca [8192 x i32], addrspace(5)
+
+  %ptr.0 = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %scratch, i32 0, i32 %offset
+  %ptr.1 = getelementptr i32, i32 addrspace(5)* %ptr.0, i32 4
+  store i32 %value, i32 addrspace(5)* %ptr.1
+  store i32 %value, i32 addrspace(1)* %out
+
+  ret void
+}
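
The two offset tests above exercise the Offset->getImm() > 0 path in insertBoundsCheck: when the scratch instruction carries a folded immediate offset, the bound is compared against the address plus that offset while the access itself keeps the folded offset. A sketch of the expected shape, with illustrative register numbers:

  v_add_u32_e32 v1, 16, v0                ; effective address used for the bounds compare
  v_cmp_gt_u32_e64 s[0:1], s4, v1
  s_and_saveexec_b64 s[2:3], s[0:1]
  buffer_load_dword v2, v0, s[8:11], s12 offen offset:16
  s_mov_b64 exec, s[2:3]
  v_cndmask_b32_e64 v3, 0, v2, s[0:1]     ; zero on out-of-bounds lanes
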