diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -296,6 +296,31 @@ .addMemOperand(MMO); } +static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + const DebugLoc &DL, const SIInstrInfo *TII, + Register TargetReg) { + MachineFunction *MF = MBB.getParent(); + const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); + const SIRegisterInfo *TRI = &TII->getRegisterInfo(); + const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); + Register TargetLo = TRI->getSubReg(TargetReg, AMDGPU::sub0); + Register TargetHi = TRI->getSubReg(TargetReg, AMDGPU::sub1); + + if (MFI->getGITPtrHigh() != 0xffffffff) { + BuildMI(MBB, I, DL, SMovB32, TargetHi) + .addImm(MFI->getGITPtrHigh()) + .addReg(TargetReg, RegState::ImplicitDefine); + } else { + const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64); + BuildMI(MBB, I, DL, GetPC64, TargetReg); + } + Register GitPtrLo = MFI->getGITPtrLoReg(*MF); + MF->getRegInfo().addLiveIn(GitPtrLo); + MBB.addLiveIn(GitPtrLo); + BuildMI(MBB, I, DL, SMovB32, TargetLo) + .addReg(GitPtrLo); +} + // Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()` void SIFrameLowering::emitEntryFunctionFlatScratchInit( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, @@ -315,16 +340,74 @@ // pointer. Because we only detect if flat instructions are used at all, // this will be used more often than necessary on VI. 
- Register FlatScratchInitReg = - MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT); - assert(FlatScratchInitReg); + Register FlatScrInitLo; + Register FlatScrInitHi; - MachineRegisterInfo &MRI = MF.getRegInfo(); - MRI.addLiveIn(FlatScratchInitReg); - MBB.addLiveIn(FlatScratchInitReg); + if (ST.isAmdPalOS()) { + // Extract the scratch offset from the descriptor in the GIT + LivePhysRegs LiveRegs; + LiveRegs.init(*TRI); + LiveRegs.addLiveIns(MBB); + + // Find unused reg to load flat scratch init into + MachineRegisterInfo &MRI = MF.getRegInfo(); + Register FlatScrInit = AMDGPU::NoRegister; + ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF); + unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2; + AllSGPR64s = AllSGPR64s.slice( + std::min(static_cast<unsigned>(AllSGPR64s.size()), NumPreloaded)); + Register GITPtrLoReg = MFI->getGITPtrLoReg(MF); + for (MCPhysReg Reg : AllSGPR64s) { + if (LiveRegs.available(MRI, Reg) && MRI.isAllocatable(Reg) && + !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) { + FlatScrInit = Reg; + break; + } + } + assert(FlatScrInit && "Failed to find free register for scratch init"); - Register FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0); - Register FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1); + FlatScrInitLo = TRI->getSubReg(FlatScrInit, AMDGPU::sub0); + FlatScrInitHi = TRI->getSubReg(FlatScrInit, AMDGPU::sub1); + + buildGitPtr(MBB, I, DL, TII, FlatScrInit); + + // We now have the GIT ptr - now get the scratch descriptor from the entry + // at offset 0 (or offset 16 for a compute shader). + MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); + const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM); + auto *MMO = MF.getMachineMemOperand( + PtrInfo, + MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant | + MachineMemOperand::MODereferenceable, + 8, Align(4)); + unsigned Offset = + MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ? 
16 : 0; + const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>(); + unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset); + BuildMI(MBB, I, DL, LoadDwordX2, FlatScrInit) + .addReg(FlatScrInit) + .addImm(EncodedOffset) // offset + .addImm(0) // glc + .addImm(0) // dlc + .addMemOperand(MMO); + + // Mask the offset in [47:0] of the descriptor + const MCInstrDesc &SAndB32 = TII->get(AMDGPU::S_AND_B32); + BuildMI(MBB, I, DL, SAndB32, FlatScrInitHi) + .addReg(FlatScrInitHi) + .addImm(0xffff); + } else { + Register FlatScratchInitReg = + MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT); + assert(FlatScratchInitReg); + + MachineRegisterInfo &MRI = MF.getRegInfo(); + MRI.addLiveIn(FlatScratchInitReg); + MBB.addLiveIn(FlatScratchInitReg); + + FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0); + FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1); + } // Do a 64-bit pointer add. if (ST.flatScratchIsPointer()) { @@ -582,26 +665,9 @@ if (ST.isAmdPalOS()) { // The pointer to the GIT is formed from the offset passed in and either // the amdgpu-git-ptr-high function attribute or the top part of the PC - Register RsrcLo = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); - Register RsrcHi = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); - const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); - - if (MFI->getGITPtrHigh() != 0xffffffff) { - BuildMI(MBB, I, DL, SMovB32, RsrcHi) - .addImm(MFI->getGITPtrHigh()) - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - } else { - const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64); - BuildMI(MBB, I, DL, GetPC64, Rsrc01); - } - Register GitPtrLo = MFI->getGITPtrLoReg(MF); - MF.getRegInfo().addLiveIn(GitPtrLo); - MBB.addLiveIn(GitPtrLo); - BuildMI(MBB, I, DL, SMovB32, RsrcLo) - .addReg(GitPtrLo) - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + buildGitPtr(MBB, I, DL, TII, Rsrc01); // We now have the GIT 
ptr - now get the scratch descriptor from the entry // at offset 0 (or offset 16 for a compute shader). diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -2030,7 +2030,7 @@ CCInfo.AllocateReg(DispatchIDReg); } - if (Info.hasFlatScratchInit()) { + if (Info.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) { Register FlatScratchInitReg = Info.addFlatScratchInit(TRI); MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(FlatScratchInitReg); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -324,6 +324,10 @@ /// of the subtarget. ArrayRef<MCPhysReg> getAllSGPR128(const MachineFunction &MF) const; + /// Return all SGPR64 which satisfy the waves per execution unit requirement + /// of the subtarget. + ArrayRef<MCPhysReg> getAllSGPR64(const MachineFunction &MF) const; + /// Return all SGPR32 which satisfy the waves per execution unit requirement /// of the subtarget. 
ArrayRef<MCPhysReg> getAllSGPR32(const MachineFunction &MF) const; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -2151,6 +2151,12 @@ ST.getMaxNumSGPRs(MF) / 4); } +ArrayRef<MCPhysReg> +SIRegisterInfo::getAllSGPR64(const MachineFunction &MF) const { + return makeArrayRef(AMDGPU::SGPR_64RegClass.begin(), + ST.getMaxNumSGPRs(MF) / 2); +} + ArrayRef<MCPhysReg> SIRegisterInfo::getAllSGPR32(const MachineFunction &MF) const { return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF)); diff --git a/llvm/test/CodeGen/AMDGPU/SRSRC-GIT-clobber-check.mir b/llvm/test/CodeGen/AMDGPU/SRSRC-GIT-clobber-check.mir --- a/llvm/test/CodeGen/AMDGPU/SRSRC-GIT-clobber-check.mir +++ b/llvm/test/CodeGen/AMDGPU/SRSRC-GIT-clobber-check.mir @@ -23,7 +23,7 @@ body: | ; CHECK: $sgpr1 = COPY killed $sgpr5 ; CHECK: $sgpr4_sgpr5 = S_GETPC_B64 - ; CHECK: $sgpr4 = S_MOV_B32 $sgpr8, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7 + ; CHECK: $sgpr4 = S_MOV_B32 $sgpr8 ; CHECK: $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM $sgpr4_sgpr5, 0, 0, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7 :: (dereferenceable invariant load 16, align 4, addrspace 4) bb.0: successors: %bb.1, %bb.2 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s ; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs 
< %s | FileCheck -check-prefixes=GCN,GFX9-PAL %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10-PAL %s define amdgpu_kernel void @zero_init_kernel() { ; GFX9-LABEL: zero_init_kernel: @@ -45,6 +47,59 @@ ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:32 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:16 ; GFX10-NEXT: s_endpgm +; +; GFX9-PAL-LABEL: zero_init_kernel: +; GFX9-PAL: ; %bb.0: +; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX9-PAL-NEXT: s_mov_b32 s2, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-PAL-NEXT: s_mov_b32 s0, 0 +; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; GFX9-PAL-NEXT: s_mov_b32 s1, s0 +; GFX9-PAL-NEXT: s_mov_b32 s2, s0 +; GFX9-PAL-NEXT: s_mov_b32 s3, s0 +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:64 +; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 +; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 +; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 +; GFX9-PAL-NEXT: s_endpgm +; +; GFX10-PAL-LABEL: zero_init_kernel: +; GFX10-PAL: ; %bb.0: +; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX10-PAL-NEXT: s_mov_b32 s2, s0 +; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-PAL-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX10-PAL-NEXT: s_mov_b32 s0, 0 +; GFX10-PAL-NEXT: ; implicit-def: $vcc_hi +; GFX10-PAL-NEXT: s_mov_b32 s1, s0 +; GFX10-PAL-NEXT: s_mov_b32 s2, s0 +; GFX10-PAL-NEXT: s_mov_b32 s3, s0 +; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:64 +; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:48 +; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:32 +; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:16 +; GFX10-PAL-NEXT: s_endpgm %alloca = alloca [32 x i16], align 2, addrspace(5) %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) @@ -89,6 +144,44 @@ ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-PAL-LABEL: zero_init_foo: +; GFX9-PAL: ; %bb.0: +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-PAL-NEXT: s_mov_b32 s0, 0 +; GFX9-PAL-NEXT: s_mov_b32 s1, s0 +; GFX9-PAL-NEXT: s_mov_b32 s2, s0 +; GFX9-PAL-NEXT: s_mov_b32 s3, s0 +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:48 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-PAL-LABEL: zero_init_foo: +; GFX10-PAL: ; %bb.0: +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-PAL-NEXT: 
s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: s_mov_b32 s0, 0 +; GFX10-PAL-NEXT: ; implicit-def: $vcc_hi +; GFX10-PAL-NEXT: s_mov_b32 s1, s0 +; GFX10-PAL-NEXT: s_mov_b32 s2, s0 +; GFX10-PAL-NEXT: s_mov_b32 s3, s0 +; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:48 +; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32 +; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 +; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 +; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] %alloca = alloca [32 x i16], align 2, addrspace(5) %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) @@ -129,6 +222,49 @@ ; GFX10-NEXT: scratch_store_dword off, v0, s0 ; GFX10-NEXT: scratch_load_dword v0, off, s1 ; GFX10-NEXT: s_endpgm +; +; GFX9-PAL-LABEL: store_load_sindex_kernel: +; GFX9-PAL: ; %bb.0: ; %bb +; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] +; GFX9-PAL-NEXT: s_mov_b32 s4, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 +; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 +; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 +; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 +; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 +; GFX9-PAL-NEXT: s_add_u32 s1, 4, s1 +; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 +; GFX9-PAL-NEXT: s_add_u32 s0, 4, s0 +; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 +; GFX9-PAL-NEXT: s_endpgm +; +; GFX10-PAL-LABEL: store_load_sindex_kernel: +; GFX10-PAL: ; %bb.0: ; %bb +; GFX10-PAL-NEXT: s_getpc_b64 s[4:5] +; 
GFX10-PAL-NEXT: s_mov_b32 s4, s0 +; GFX10-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-PAL-NEXT: s_and_b32 s5, s5, 0xffff +; GFX10-PAL-NEXT: s_add_u32 s4, s4, s3 +; GFX10-PAL-NEXT: s_addc_u32 s5, s5, 0 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 +; GFX10-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15 +; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15 +; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2 +; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2 +; GFX10-PAL-NEXT: s_add_u32 s0, 4, s0 +; GFX10-PAL-NEXT: s_add_u32 s1, 4, s1 +; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0 +; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 +; GFX10-PAL-NEXT: s_endpgm bb: %i = alloca [32 x float], align 4, addrspace(5) %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* @@ -172,6 +308,46 @@ ; GFX10-NEXT: scratch_store_dword off, v0, s1 ; GFX10-NEXT: scratch_load_dword v0, off, s0 ; GFX10-NEXT: s_endpgm +; +; GFX9-PAL-LABEL: store_load_sindex_foo: +; GFX9-PAL: ; %bb.0: ; %bb +; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX9-PAL-NEXT: s_mov_b32 s2, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 +; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 +; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 +; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 +; GFX9-PAL-NEXT: s_add_u32 s1, 4, s1 +; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 +; GFX9-PAL-NEXT: s_add_u32 s0, 4, s0 +; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 +; GFX9-PAL-NEXT: s_endpgm +; +; GFX10-PAL-LABEL: store_load_sindex_foo: +; GFX10-PAL: ; %bb.0: ; %bb +; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX10-PAL-NEXT: s_mov_b32 s2, s0 +; GFX10-PAL-NEXT: 
s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15 +; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15 +; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2 +; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2 +; GFX10-PAL-NEXT: s_add_u32 s0, 4, s0 +; GFX10-PAL-NEXT: s_add_u32 s1, 4, s1 +; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0 +; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 +; GFX10-PAL-NEXT: s_endpgm bb: %i = alloca [32 x float], align 4, addrspace(5) %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* @@ -213,6 +389,44 @@ ; GFX10-NEXT: scratch_store_dword v2, v3, off ; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 ; GFX10-NEXT: s_endpgm +; +; GFX9-PAL-LABEL: store_load_vindex_kernel: +; GFX9-PAL: ; %bb.0: ; %bb +; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX9-PAL-NEXT: s_mov_b32 s2, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 4 +; GFX9-PAL-NEXT: v_add_u32_e32 v2, v1, v0 +; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off +; GFX9-PAL-NEXT: v_sub_u32_e32 v0, v1, v0 +; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 +; GFX9-PAL-NEXT: s_endpgm +; +; GFX10-PAL-LABEL: store_load_vindex_kernel: +; GFX10-PAL: ; %bb.0: ; %bb +; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX10-PAL-NEXT: s_mov_b32 s2, s0 +; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX10-PAL-NEXT: 
s_add_u32 s2, s2, s1 +; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 4 +; GFX10-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 15 +; GFX10-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0 +; GFX10-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX10-PAL-NEXT: scratch_store_dword v2, v3, off +; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 +; GFX10-PAL-NEXT: s_endpgm bb: %i = alloca [32 x float], align 4, addrspace(5) %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* @@ -257,6 +471,35 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-PAL-LABEL: store_load_vindex_foo: +; GFX9-PAL: ; %bb.0: ; %bb +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s32 +; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1 +; GFX9-PAL-NEXT: v_and_b32_e32 v0, v0, v3 +; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off +; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-PAL-LABEL: store_load_vindex_foo: +; GFX10-PAL: ; %bb.0: ; %bb +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 +; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s32 +; GFX10-PAL-NEXT: ; implicit-def: $vcc_hi +; GFX10-PAL-NEXT: v_and_b32_e32 v3, v0, v1 +; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v2 +; GFX10-PAL-NEXT: v_lshl_add_u32 v2, v3, 2, v2 +; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off +; GFX10-PAL-NEXT: scratch_load_dword v0, v2, off +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] bb: %i = alloca [32 
x float], align 4, addrspace(5) %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* @@ -288,6 +531,24 @@ ; GFX10-NEXT: scratch_store_dword v0, v1, off offset:4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-PAL-LABEL: private_ptr_foo: +; GFX9-PAL: ; %bb.0: +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 0x41200000 +; GFX9-PAL-NEXT: scratch_store_dword v0, v1, off offset:4 +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-PAL-LABEL: private_ptr_foo: +; GFX10-PAL: ; %bb.0: +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 0x41200000 +; GFX10-PAL-NEXT: ; implicit-def: $vcc_hi +; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off offset:4 +; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds float, float addrspace(5)* %arg, i32 1 store float 1.000000e+01, float addrspace(5)* %gep, align 4 ret void @@ -341,6 +602,64 @@ ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:304 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:320 ; GFX10-NEXT: s_endpgm +; +; GFX9-PAL-LABEL: zero_init_small_offset_kernel: +; GFX9-PAL: ; %bb.0: +; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX9-PAL-NEXT: s_mov_b32 s2, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-PAL-NEXT: s_mov_b32 s0, 0 +; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 +; GFX9-PAL-NEXT: s_mov_b32 s1, s0 +; GFX9-PAL-NEXT: s_mov_b32 s2, s0 +; GFX9-PAL-NEXT: s_mov_b32 s3, s0 +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 
s1 +; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:272 +; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:288 +; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:304 +; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:320 +; GFX9-PAL-NEXT: s_endpgm +; +; GFX10-PAL-LABEL: zero_init_small_offset_kernel: +; GFX10-PAL: ; %bb.0: +; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX10-PAL-NEXT: s_mov_b32 s2, s0 +; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4 +; GFX10-PAL-NEXT: s_mov_b32 s0, 0 +; GFX10-PAL-NEXT: ; implicit-def: $vcc_hi +; GFX10-PAL-NEXT: s_mov_b32 s1, s0 +; GFX10-PAL-NEXT: s_mov_b32 s2, s0 +; GFX10-PAL-NEXT: s_mov_b32 s3, s0 +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:272 +; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:288 +; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:304 +; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:320 +; GFX10-PAL-NEXT: s_endpgm %padding = alloca [64 x i32], align 4, addrspace(5) %alloca = alloca [32 x i16], align 2, addrspace(5) %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef @@ -392,6 
+711,48 @@ ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-PAL-LABEL: zero_init_small_offset_foo: +; GFX9-PAL: ; %bb.0: +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-PAL-NEXT: scratch_load_dword v0, off, s32 +; GFX9-PAL-NEXT: s_mov_b32 s0, 0 +; GFX9-PAL-NEXT: s_mov_b32 s1, s0 +; GFX9-PAL-NEXT: s_mov_b32 s2, s0 +; GFX9-PAL-NEXT: s_mov_b32 s3, s0 +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304 +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-PAL-LABEL: zero_init_small_offset_foo: +; GFX10-PAL: ; %bb.0: +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: scratch_load_dword v0, off, s32 +; GFX10-PAL-NEXT: s_mov_b32 s0, 0 +; GFX10-PAL-NEXT: ; implicit-def: $vcc_hi +; GFX10-PAL-NEXT: s_mov_b32 s1, s0 +; GFX10-PAL-NEXT: s_mov_b32 s2, s0 +; GFX10-PAL-NEXT: s_mov_b32 s3, s0 +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256 +; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272 +; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288 +; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304 +; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] %padding = 
alloca [64 x i32], align 4, addrspace(5) %alloca = alloca [32 x i16], align 2, addrspace(5) %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef @@ -440,6 +801,54 @@ ; GFX10-NEXT: scratch_store_dword off, v0, s0 ; GFX10-NEXT: scratch_load_dword v0, off, s1 ; GFX10-NEXT: s_endpgm +; +; GFX9-PAL-LABEL: store_load_sindex_small_offset_kernel: +; GFX9-PAL: ; %bb.0: ; %bb +; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] +; GFX9-PAL-NEXT: s_mov_b32 s4, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 +; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 +; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 +; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 +; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 +; GFX9-PAL-NEXT: s_add_u32 s1, 0x104, s1 +; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 +; GFX9-PAL-NEXT: s_add_u32 s0, 0x104, s0 +; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 +; GFX9-PAL-NEXT: s_endpgm +; +; GFX10-PAL-LABEL: store_load_sindex_small_offset_kernel: +; GFX10-PAL: ; %bb.0: ; %bb +; GFX10-PAL-NEXT: s_getpc_b64 s[4:5] +; GFX10-PAL-NEXT: s_mov_b32 s4, s0 +; GFX10-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-PAL-NEXT: s_and_b32 s5, s5, 0xffff +; GFX10-PAL-NEXT: s_add_u32 s4, s4, s3 +; GFX10-PAL-NEXT: s_addc_u32 s5, s5, 0 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 +; GFX10-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4 +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15 +; GFX10-PAL-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15 +; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2 +; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2 +; GFX10-PAL-NEXT: s_add_u32 s0, 0x104, s0 +; GFX10-PAL-NEXT: s_add_u32 s1, 0x104, s1 +; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0 +; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 +; GFX10-PAL-NEXT: s_endpgm bb: %padding = alloca [64 x i32], align 4, addrspace(5) %i = alloca [32 x float], align 4, addrspace(5) @@ -491,6 +900,51 @@ ; GFX10-NEXT: scratch_store_dword off, v0, s1 ; GFX10-NEXT: scratch_load_dword v0, off, s0 ; GFX10-NEXT: s_endpgm +; +; GFX9-PAL-LABEL: store_load_sindex_small_offset_foo: +; GFX9-PAL: ; %bb.0: ; %bb +; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX9-PAL-NEXT: s_mov_b32 s2, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 +; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 +; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 +; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 +; GFX9-PAL-NEXT: s_add_u32 s1, 0x104, s1 +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 +; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 +; GFX9-PAL-NEXT: s_add_u32 s0, 0x104, s0 +; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 +; GFX9-PAL-NEXT: s_endpgm +; +; GFX10-PAL-LABEL: store_load_sindex_small_offset_foo: +; GFX10-PAL: ; %bb.0: ; %bb +; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX10-PAL-NEXT: s_mov_b32 s2, s0 +; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; 
GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4 +; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15 +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15 +; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2 +; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2 +; GFX10-PAL-NEXT: s_add_u32 s0, 0x104, s0 +; GFX10-PAL-NEXT: s_add_u32 s1, 0x104, s1 +; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0 +; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 +; GFX10-PAL-NEXT: s_endpgm bb: %padding = alloca [64 x i32], align 4, addrspace(5) %i = alloca [32 x float], align 4, addrspace(5) @@ -539,6 +993,48 @@ ; GFX10-NEXT: scratch_store_dword v2, v3, off ; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 ; GFX10-NEXT: s_endpgm +; +; GFX9-PAL-LABEL: store_load_vindex_small_offset_kernel: +; GFX9-PAL: ; %bb.0: ; %bb +; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX9-PAL-NEXT: s_mov_b32 s2, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; GFX9-PAL-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 0x104 +; GFX9-PAL-NEXT: v_add_u32_e32 v2, v1, v0 +; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off +; GFX9-PAL-NEXT: v_sub_u32_e32 v0, v1, v0 +; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 +; GFX9-PAL-NEXT: s_endpgm +; +; GFX10-PAL-LABEL: store_load_vindex_small_offset_kernel: +; GFX10-PAL: ; %bb.0: ; %bb +; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX10-PAL-NEXT: s_mov_b32 s2, s0 +; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 +; 
GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 0x104 +; GFX10-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 15 +; GFX10-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0 +; GFX10-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX10-PAL-NEXT: scratch_load_dword v1, off, off offset:4 +; GFX10-PAL-NEXT: scratch_store_dword v2, v3, off +; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 +; GFX10-PAL-NEXT: s_endpgm bb: %padding = alloca [64 x i32], align 4, addrspace(5) %i = alloca [32 x float], align 4, addrspace(5) @@ -591,6 +1087,40 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-PAL-LABEL: store_load_vindex_small_offset_foo: +; GFX9-PAL: ; %bb.0: ; %bb +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-PAL-NEXT: scratch_load_dword v1, off, s32 +; GFX9-PAL-NEXT: s_add_u32 vcc_hi, s32, 0x100 +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: v_mov_b32_e32 v1, vcc_hi +; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1 +; GFX9-PAL-NEXT: v_and_b32_e32 v0, v0, v3 +; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off +; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-PAL-LABEL: store_load_vindex_small_offset_foo: +; GFX10-PAL: ; %bb.0: ; %bb +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 +; GFX10-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x100 +; GFX10-PAL-NEXT: ; implicit-def: $vcc_hi +; GFX10-PAL-NEXT: v_mov_b32_e32 v2, vcc_lo +; GFX10-PAL-NEXT: v_and_b32_e32 v3, v0, v1 +; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v2 +; GFX10-PAL-NEXT: v_lshl_add_u32 v2, v3, 2, v2 +; GFX10-PAL-NEXT: scratch_load_dword v3, 
off, s32 +; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off +; GFX10-PAL-NEXT: scratch_load_dword v0, v2, off +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] bb: %padding = alloca [64 x i32], align 4, addrspace(5) %i = alloca [32 x float], align 4, addrspace(5) @@ -659,6 +1189,68 @@ ; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 ; GFX10-NEXT: s_endpgm +; +; GFX9-PAL-LABEL: zero_init_large_offset_kernel: +; GFX9-PAL: ; %bb.0: +; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX9-PAL-NEXT: s_mov_b32 s2, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-PAL-NEXT: s_mov_b32 s0, 0 +; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 +; GFX9-PAL-NEXT: s_mov_b32 s1, s0 +; GFX9-PAL-NEXT: s_mov_b32 s2, s0 +; GFX9-PAL-NEXT: s_mov_b32 s3, s0 +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi +; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 +; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 +; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 +; GFX9-PAL-NEXT: s_endpgm +; +; GFX10-PAL-LABEL: zero_init_large_offset_kernel: +; GFX10-PAL: ; %bb.0: +; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX10-PAL-NEXT: s_mov_b32 s2, s0 +; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; 
GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4 +; GFX10-PAL-NEXT: s_mov_b32 s0, 0 +; GFX10-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-PAL-NEXT: s_mov_b32 s1, s0 +; GFX10-PAL-NEXT: s_mov_b32 s2, s0 +; GFX10-PAL-NEXT: s_mov_b32 s3, s0 +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-PAL-NEXT: ; implicit-def: $vcc_hi +; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo +; GFX10-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 +; GFX10-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 +; GFX10-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 +; GFX10-PAL-NEXT: s_endpgm %padding = alloca [4096 x i32], align 4, addrspace(5) %alloca = alloca [32 x i16], align 2, addrspace(5) %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef @@ -718,6 +1310,56 @@ ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-PAL-LABEL: zero_init_large_offset_foo: +; GFX9-PAL: ; %bb.0: +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-PAL-NEXT: scratch_load_dword v0, off, s32 +; GFX9-PAL-NEXT: s_mov_b32 s0, 0 +; GFX9-PAL-NEXT: s_mov_b32 s1, s0 +; GFX9-PAL-NEXT: s_mov_b32 s2, s0 +; GFX9-PAL-NEXT: s_mov_b32 s3, s0 +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-PAL-NEXT: 
v_mov_b32_e32 v1, s1 +; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-PAL-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi +; GFX9-PAL-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 +; GFX9-PAL-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 +; GFX9-PAL-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-PAL-LABEL: zero_init_large_offset_foo: +; GFX10-PAL: ; %bb.0: +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: scratch_load_dword v0, off, s32 +; GFX10-PAL-NEXT: s_mov_b32 s0, 0 +; GFX10-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-PAL-NEXT: s_mov_b32 s1, s0 +; GFX10-PAL-NEXT: s_mov_b32 s2, s0 +; GFX10-PAL-NEXT: s_mov_b32 s3, s0 +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-PAL-NEXT: ; implicit-def: $vcc_hi +; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo +; GFX10-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 +; GFX10-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 +; GFX10-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 +; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] %padding = alloca [4096 x i32], align 4, addrspace(5) %alloca = alloca [32 x i16], align 2, addrspace(5) %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 
0, i32 undef @@ -766,6 +1408,54 @@ ; GFX10-NEXT: scratch_store_dword off, v0, s0 ; GFX10-NEXT: scratch_load_dword v0, off, s1 ; GFX10-NEXT: s_endpgm +; +; GFX9-PAL-LABEL: store_load_sindex_large_offset_kernel: +; GFX9-PAL: ; %bb.0: ; %bb +; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] +; GFX9-PAL-NEXT: s_mov_b32 s4, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 +; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 +; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 +; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 +; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 +; GFX9-PAL-NEXT: s_add_u32 s1, 0x4004, s1 +; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 +; GFX9-PAL-NEXT: s_add_u32 s0, 0x4004, s0 +; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 +; GFX9-PAL-NEXT: s_endpgm +; +; GFX10-PAL-LABEL: store_load_sindex_large_offset_kernel: +; GFX10-PAL: ; %bb.0: ; %bb +; GFX10-PAL-NEXT: s_getpc_b64 s[4:5] +; GFX10-PAL-NEXT: s_mov_b32 s4, s0 +; GFX10-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-PAL-NEXT: s_and_b32 s5, s5, 0xffff +; GFX10-PAL-NEXT: s_add_u32 s4, s4, s3 +; GFX10-PAL-NEXT: s_addc_u32 s5, s5, 0 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 +; GFX10-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4 +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15 +; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15 +; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2 +; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2 +; GFX10-PAL-NEXT: s_add_u32 s0, 0x4004, s0 
+; GFX10-PAL-NEXT: s_add_u32 s1, 0x4004, s1 +; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0 +; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 +; GFX10-PAL-NEXT: s_endpgm bb: %padding = alloca [4096 x i32], align 4, addrspace(5) %i = alloca [32 x float], align 4, addrspace(5) @@ -817,6 +1507,51 @@ ; GFX10-NEXT: scratch_store_dword off, v0, s1 ; GFX10-NEXT: scratch_load_dword v0, off, s0 ; GFX10-NEXT: s_endpgm +; +; GFX9-PAL-LABEL: store_load_sindex_large_offset_foo: +; GFX9-PAL: ; %bb.0: ; %bb +; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX9-PAL-NEXT: s_mov_b32 s2, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 +; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 +; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 +; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 +; GFX9-PAL-NEXT: s_add_u32 s1, 0x4004, s1 +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 +; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 +; GFX9-PAL-NEXT: s_add_u32 s0, 0x4004, s0 +; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 +; GFX9-PAL-NEXT: s_endpgm +; +; GFX10-PAL-LABEL: store_load_sindex_large_offset_foo: +; GFX10-PAL: ; %bb.0: ; %bb +; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX10-PAL-NEXT: s_mov_b32 s2, s0 +; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4 +; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15 +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15 
+; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2 +; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2 +; GFX10-PAL-NEXT: s_add_u32 s0, 0x4004, s0 +; GFX10-PAL-NEXT: s_add_u32 s1, 0x4004, s1 +; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0 +; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 +; GFX10-PAL-NEXT: s_endpgm bb: %padding = alloca [4096 x i32], align 4, addrspace(5) %i = alloca [32 x float], align 4, addrspace(5) @@ -865,6 +1600,48 @@ ; GFX10-NEXT: scratch_store_dword v2, v3, off ; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 ; GFX10-NEXT: s_endpgm +; +; GFX9-PAL-LABEL: store_load_vindex_large_offset_kernel: +; GFX9-PAL: ; %bb.0: ; %bb +; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX9-PAL-NEXT: s_mov_b32 s2, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; GFX9-PAL-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 0x4004 +; GFX9-PAL-NEXT: v_add_u32_e32 v2, v1, v0 +; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off +; GFX9-PAL-NEXT: v_sub_u32_e32 v0, v1, v0 +; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 +; GFX9-PAL-NEXT: s_endpgm +; +; GFX10-PAL-LABEL: store_load_vindex_large_offset_kernel: +; GFX10-PAL: ; %bb.0: ; %bb +; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX10-PAL-NEXT: s_mov_b32 s2, s0 +; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 0x4004 +; 
GFX10-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 15 +; GFX10-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0 +; GFX10-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX10-PAL-NEXT: scratch_load_dword v1, off, off offset:4 +; GFX10-PAL-NEXT: scratch_store_dword v2, v3, off +; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 +; GFX10-PAL-NEXT: s_endpgm bb: %padding = alloca [4096 x i32], align 4, addrspace(5) %i = alloca [32 x float], align 4, addrspace(5) @@ -917,6 +1694,40 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-PAL-LABEL: store_load_vindex_large_offset_foo: +; GFX9-PAL: ; %bb.0: ; %bb +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-PAL-NEXT: scratch_load_dword v1, off, s32 +; GFX9-PAL-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: v_mov_b32_e32 v1, vcc_hi +; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1 +; GFX9-PAL-NEXT: v_and_b32_e32 v0, v0, v3 +; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off +; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-PAL-LABEL: store_load_vindex_large_offset_foo: +; GFX10-PAL: ; %bb.0: ; %bb +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 +; GFX10-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-PAL-NEXT: ; implicit-def: $vcc_hi +; GFX10-PAL-NEXT: v_mov_b32_e32 v2, vcc_lo +; GFX10-PAL-NEXT: v_and_b32_e32 v3, v0, v1 +; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v2 +; GFX10-PAL-NEXT: v_lshl_add_u32 v2, v3, 2, v2 +; GFX10-PAL-NEXT: scratch_load_dword v3, off, s32 +; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off +; GFX10-PAL-NEXT: scratch_load_dword v0, v2, off +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) +; 
GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] bb: %padding = alloca [4096 x i32], align 4, addrspace(5) %i = alloca [32 x float], align 4, addrspace(5) @@ -962,6 +1773,45 @@ ; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664 ; GFX10-NEXT: scratch_load_dword v0, off, s0 offset:1664 ; GFX10-NEXT: s_endpgm +; +; GFX9-PAL-LABEL: store_load_large_imm_offset_kernel: +; GFX9-PAL: ; %bb.0: ; %bb +; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX9-PAL-NEXT: s_mov_b32 s2, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 13 +; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000 +; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; GFX9-PAL-NEXT: scratch_store_dword off, v0, vcc_hi offset:4 +; GFX9-PAL-NEXT: s_add_u32 s0, 4, s0 +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 +; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:3712 +; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 offset:3712 +; GFX9-PAL-NEXT: s_endpgm +; +; GFX10-PAL-LABEL: store_load_large_imm_offset_kernel: +; GFX10-PAL: ; %bb.0: ; %bb +; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX10-PAL-NEXT: s_mov_b32 s2, s0 +; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 13 +; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 +; GFX10-PAL-NEXT: s_movk_i32 s0, 0x3800 +; GFX10-PAL-NEXT: s_add_u32 s0, 4, s0 +; GFX10-PAL-NEXT: scratch_store_dword off, v0, off offset:4 +; GFX10-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664 +; GFX10-PAL-NEXT: scratch_load_dword v0, off, s0 
offset:1664 +; GFX10-PAL-NEXT: s_endpgm bb: %i = alloca [4096 x i32], align 4, addrspace(5) %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef @@ -1002,6 +1852,35 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-PAL-LABEL: store_load_large_imm_offset_foo: +; GFX9-PAL: ; %bb.0: ; %bb +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000 +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 13 +; GFX9-PAL-NEXT: scratch_store_dword off, v0, s32 +; GFX9-PAL-NEXT: s_add_u32 s0, s32, s0 +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 +; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:3712 +; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 offset:3712 +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-PAL-LABEL: store_load_large_imm_offset_foo: +; GFX10-PAL: ; %bb.0: ; %bb +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 13 +; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 +; GFX10-PAL-NEXT: s_movk_i32 s0, 0x3800 +; GFX10-PAL-NEXT: ; implicit-def: $vcc_hi +; GFX10-PAL-NEXT: s_add_u32 s0, s32, s0 +; GFX10-PAL-NEXT: scratch_store_dword off, v0, s32 +; GFX10-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664 +; GFX10-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664 +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] bb: %i = alloca [4096 x i32], align 4, addrspace(5) %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef @@ -1042,6 +1921,44 @@ ; GFX10-NEXT: scratch_store_dword v0, v1, off offset:1024 ; GFX10-NEXT: scratch_load_dword v0, v0, off offset:1024 ; GFX10-NEXT: s_endpgm +; +; GFX9-PAL-LABEL: store_load_vidx_sidx_offset: +; GFX9-PAL: ; %bb.0: ; %bb +; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] +; GFX9-PAL-NEXT: 
s_mov_b32 s4, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 4 +; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 +; GFX9-PAL-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 +; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15 +; GFX9-PAL-NEXT: scratch_store_dword v0, v1, off offset:1024 +; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:1024 +; GFX9-PAL-NEXT: s_endpgm +; +; GFX10-PAL-LABEL: store_load_vidx_sidx_offset: +; GFX10-PAL: ; %bb.0: ; %bb +; GFX10-PAL-NEXT: s_getpc_b64 s[4:5] +; GFX10-PAL-NEXT: s_mov_b32 s4, s0 +; GFX10-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-PAL-NEXT: s_and_b32 s5, s5, 0xffff +; GFX10-PAL-NEXT: s_add_u32 s4, s4, s3 +; GFX10-PAL-NEXT: s_addc_u32 s5, s5, 0 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 +; GFX10-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 +; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-PAL-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, 4 +; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off offset:1024 +; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:1024 +; GFX10-PAL-NEXT: s_endpgm bb: %alloca = alloca [32 x i32], align 4, addrspace(5) %vidx = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -1076,6 +1993,29 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-PAL-LABEL: store_load_i64_aligned: +; GFX9-PAL: ; %bb.0: ; %bb +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15 +; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off +; 
GFX9-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-PAL-LABEL: store_load_i64_aligned: +; GFX10-PAL: ; %bb.0: ; %bb +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 +; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-PAL-NEXT: ; implicit-def: $vcc_hi +; GFX10-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off +; GFX10-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] bb: store volatile i64 15, i64 addrspace(5)* %arg, align 8 %load = load volatile i64, i64 addrspace(5)* %arg, align 8 @@ -1105,6 +2045,29 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-PAL-LABEL: store_load_i64_unaligned: +; GFX9-PAL: ; %bb.0: ; %bb +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15 +; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off +; GFX9-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-PAL-LABEL: store_load_i64_unaligned: +; GFX10-PAL: ; %bb.0: ; %bb +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 +; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-PAL-NEXT: ; implicit-def: $vcc_hi +; GFX10-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off +; GFX10-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] bb: store volatile i64 15, i64 addrspace(5)* %arg, align 1 %load = load volatile i64, i64 addrspace(5)* %arg, align 1 @@ -1136,6 +2099,31 @@ ; 
GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-PAL-LABEL: store_load_v3i32_unaligned: +; GFX9-PAL: ; %bb.0: ; %bb +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 1 +; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 2 +; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 3 +; GFX9-PAL-NEXT: scratch_store_dwordx3 v0, v[1:3], off +; GFX9-PAL-NEXT: scratch_load_dwordx3 v[0:2], v0, off +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-PAL-LABEL: store_load_v3i32_unaligned: +; GFX10-PAL: ; %bb.0: ; %bb +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 1 +; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 2 +; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 3 +; GFX10-PAL-NEXT: ; implicit-def: $vcc_hi +; GFX10-PAL-NEXT: scratch_store_dwordx3 v0, v[1:3], off +; GFX10-PAL-NEXT: scratch_load_dwordx3 v[0:2], v0, off +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] bb: store volatile <3 x i32> , <3 x i32> addrspace(5)* %arg, align 1 %load = load volatile <3 x i32>, <3 x i32> addrspace(5)* %arg, align 1 @@ -1169,6 +2157,33 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-PAL-LABEL: store_load_v4i32_unaligned: +; GFX9-PAL: ; %bb.0: ; %bb +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 1 +; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 2 +; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 3 +; GFX9-PAL-NEXT: v_mov_b32_e32 v4, 4 +; GFX9-PAL-NEXT: scratch_store_dwordx4 v0, v[1:4], off +; GFX9-PAL-NEXT: scratch_load_dwordx4 v[0:3], v0, off +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-PAL-LABEL: store_load_v4i32_unaligned: +; GFX10-PAL: ; %bb.0: ; %bb +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 1 +; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 2 +; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 3 +; GFX10-PAL-NEXT: v_mov_b32_e32 v4, 4 +; GFX10-PAL-NEXT: ; implicit-def: $vcc_hi +; GFX10-PAL-NEXT: scratch_store_dwordx4 v0, v[1:4], off +; GFX10-PAL-NEXT: scratch_load_dwordx4 v[0:3], v0, off +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] bb: store volatile <4 x i32> , <4 x i32> addrspace(5)* %arg, align 1 %load = load volatile <4 x i32>, <4 x i32> addrspace(5)* %arg, align 1 diff --git a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll --- a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll +++ b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll @@ -6,6 +6,8 @@ ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global,+wavefrontsize64 -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10_W64,GFX9_10,MUBUF,GFX10_W64-MUBUF,GFX9_10-MUBUF %s ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX9_10,FLATSCR,GFX9-FLATSCR %s ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx1030 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10_W32,GFX9_10,FLATSCR,GFX10-FLATSCR,GFX9_10-FLATSCR %s +; RUN: llc -march=amdgcn -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX9_10,FLATSCR,GFX9-FLATSCR-PAL %s +; RUN: llc -march=amdgcn -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing 
-amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10_W32,GFX9_10,FLATSCR,GFX10-FLATSCR-PAL,GFX9_10-FLATSCR %s ; RELS: R_AMDGPU_ABS32_LO SCRATCH_RSRC_DWORD0 0x0 ; RELS: R_AMDGPU_ABS32_LO SCRATCH_RSRC_DWORD1 0x0 @@ -25,6 +27,28 @@ ; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX9-FLATSCR-PAL-DAG: s_getpc_b64 s[2:3] +; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s2, s0 +; GFX9-FLATSCR-PAL-DAG: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-FLATSCR-PAL-DAG: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLATSCR-PAL-DAG: v_mov_b32_e32 v0, 0xbf20e7f4 +; GFX9-FLATSCR-PAL-DAG: s_mov_b32 vcc_hi, 0 +; GFX9-FLATSCR-PAL-DAG: s_waitcnt lgkmcnt(0) +; GFX9-FLATSCR-PAL-DAG: s_and_b32 s3, s3, 0xffff +; GFX9-FLATSCR-PAL-DAG: s_add_u32 flat_scratch_lo, s2, s0 +; GFX9-FLATSCR-PAL-DAG: s_addc_u32 flat_scratch_hi, s3, 0 +; GFX9-FLATSCR-PAL-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, v0 + +; GFX10-FLATSCR-PAL: s_getpc_b64 s[2:3] +; GFX10-FLATSCR-PAL: s_mov_b32 s2, s0 +; GFX10-FLATSCR-PAL: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX10-FLATSCR-PAL: s_waitcnt lgkmcnt(0) +; GFX10-FLATSCR-PAL: s_and_b32 s3, s3, 0xffff +; GFX10-FLATSCR-PAL: s_add_u32 s2, s2, s0 +; GFX10-FLATSCR-PAL: s_addc_u32 s3, s3, 0 +; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 + ; MUBUF-DAG: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 ; MUBUF-DAG: s_mov_b32 s1, SCRATCH_RSRC_DWORD1 ; MUBUF-DAG: s_mov_b32 s2, -1 @@ -44,6 +68,7 @@ ; MUBUF-DAG: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0 ; MUBUF-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]] ; GFX10-FLATSCR: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, v0 +; GFX10-FLATSCR-PAL: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, v0 ; GCN-NOT: s_mov_b32 s0 ; GCN-DAG: v_add{{_|_nc_}}{{i|u}}32_e32 [[HI_OFF:v[0-9]+]],{{.*}} 0x280, [[CLAMP_IDX]] @@ -68,6 +93,27 @@ ; GFX10-FLATSCR: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX9-FLATSCR-PAL-DAG: s_getpc_b64 s[2:3] +; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s2, s0 +; GFX9-FLATSCR-PAL-DAG: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-FLATSCR-PAL-DAG: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLATSCR-PAL-DAG: v_mov_b32_e32 v0, 0xbf20e7f4 +; GFX9-FLATSCR-PAL-DAG: s_mov_b32 vcc_hi, 0 +; GFX9-FLATSCR-PAL-DAG: s_waitcnt lgkmcnt(0) +; GFX9-FLATSCR-PAL-DAG: s_and_b32 s3, s3, 0xffff +; GFX9-FLATSCR-PAL-DAG: s_add_u32 flat_scratch_lo, s2, s0 +; GFX9-FLATSCR-PAL-DAG: s_addc_u32 flat_scratch_hi, s3, 0 + +; GFX10-FLATSCR-PAL: s_getpc_b64 s[2:3] +; GFX10-FLATSCR-PAL: s_mov_b32 s2, s0 +; GFX10-FLATSCR-PAL: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX10-FLATSCR-PAL: s_waitcnt lgkmcnt(0) +; GFX10-FLATSCR-PAL: s_and_b32 s3, s3, 0xffff +; GFX10-FLATSCR-PAL: s_add_u32 s2, s2, s0 +; GFX10-FLATSCR-PAL: s_addc_u32 s3, s3, 0 +; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 + ; MUBUF-DAG: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 ; GCN-NOT: s_mov_b32 s0 @@ -98,6 +144,27 @@ ; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX9-FLATSCR-PAL-DAG: s_getpc_b64 s[2:3] +; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s2, s0 +; GFX9-FLATSCR-PAL-DAG: s_load_dwordx2 s[2:3], s[2:3], 0x10 +; GFX9-FLATSCR-PAL-DAG: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLATSCR-PAL-DAG: v_mov_b32_e32 v0, 0xbf20e7f4 +; GFX9-FLATSCR-PAL-DAG: s_mov_b32 vcc_hi, 0 +; GFX9-FLATSCR-PAL-DAG: s_waitcnt lgkmcnt(0) +; GFX9-FLATSCR-PAL-DAG: s_and_b32 s3, s3, 0xffff +; GFX9-FLATSCR-PAL-DAG: s_add_u32 flat_scratch_lo, s2, s0 +; GFX9-FLATSCR-PAL-DAG: s_addc_u32 flat_scratch_hi, s3, 0 + +; GFX10-FLATSCR-PAL: s_getpc_b64 s[2:3] +; GFX10-FLATSCR-PAL: s_mov_b32 s2, s0 +; GFX10-FLATSCR-PAL: s_load_dwordx2 s[2:3], s[2:3], 0x10 +; GFX10-FLATSCR-PAL: s_waitcnt lgkmcnt(0) +; GFX10-FLATSCR-PAL: s_and_b32 s3, s3, 
0xffff +; GFX10-FLATSCR-PAL: s_add_u32 s2, s2, s0 +; GFX10-FLATSCR-PAL: s_addc_u32 s3, s3, 0 +; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 + ; MUBUF-DAG: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 ; FLATSCR-NOT: SCRATCH_RSRC_DWORD @@ -152,6 +219,27 @@ ; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX9-FLATSCR-PAL-DAG: s_getpc_b64 s[0:1] +; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s0, s8 +; GFX9-FLATSCR-PAL-DAG: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-FLATSCR-PAL-DAG: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLATSCR-PAL-DAG: v_mov_b32_e32 v0, 0xbf20e7f4 +; GFX9-FLATSCR-PAL-DAG: s_mov_b32 vcc_hi, 0 +; GFX9-FLATSCR-PAL-DAG: s_waitcnt lgkmcnt(0) +; GFX9-FLATSCR-PAL-DAG: s_and_b32 s1, s1, 0xffff +; GFX9-FLATSCR-PAL-DAG: s_add_u32 flat_scratch_lo, s0, s5 +; GFX9-FLATSCR-PAL-DAG: s_addc_u32 flat_scratch_hi, s1, 0 + +; GFX10-FLATSCR-PAL: s_getpc_b64 s[0:1] +; GFX10-FLATSCR-PAL: s_mov_b32 s0, s8 +; GFX10-FLATSCR-PAL: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX10-FLATSCR-PAL: s_waitcnt lgkmcnt(0) +; GFX10-FLATSCR-PAL: s_and_b32 s1, s1, 0xffff +; GFX10-FLATSCR-PAL: s_add_u32 s0, s0, s5 +; GFX10-FLATSCR-PAL: s_addc_u32 s1, s1, 0 +; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 + ; SIVI: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen @@ -184,6 +272,27 @@ ; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX9-FLATSCR-PAL-DAG: s_getpc_b64 s[0:1] +; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s0, s8 +; GFX9-FLATSCR-PAL-DAG: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-FLATSCR-PAL-DAG: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLATSCR-PAL-DAG: v_mov_b32_e32 v0, 
0xbf20e7f4 +; GFX9-FLATSCR-PAL-DAG: s_mov_b32 vcc_hi, 0 +; GFX9-FLATSCR-PAL-DAG: s_waitcnt lgkmcnt(0) +; GFX9-FLATSCR-PAL-DAG: s_and_b32 s1, s1, 0xffff +; GFX9-FLATSCR-PAL-DAG: s_add_u32 flat_scratch_lo, s0, s5 +; GFX9-FLATSCR-PAL-DAG: s_addc_u32 flat_scratch_hi, s1, 0 + +; GFX10-FLATSCR-PAL: s_getpc_b64 s[0:1] +; GFX10-FLATSCR-PAL: s_mov_b32 s0, s8 +; GFX10-FLATSCR-PAL: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX10-FLATSCR-PAL: s_waitcnt lgkmcnt(0) +; GFX10-FLATSCR-PAL: s_and_b32 s1, s1, 0xffff +; GFX10-FLATSCR-PAL: s_add_u32 s0, s0, s5 +; GFX10-FLATSCR-PAL: s_addc_u32 s1, s1, 0 +; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 + ; MUBUF: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; FLATSCR-NOT: SCRATCH_RSRC_DWORD @@ -217,6 +326,27 @@ ; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX9-FLATSCR-PAL-DAG: s_getpc_b64 s[0:1] +; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s0, s8 +; GFX9-FLATSCR-PAL-DAG: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-FLATSCR-PAL-DAG: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLATSCR-PAL-DAG: v_mov_b32_e32 v0, 0xbf20e7f4 +; GFX9-FLATSCR-PAL-DAG: s_mov_b32 vcc_hi, 0 +; GFX9-FLATSCR-PAL-DAG: s_waitcnt lgkmcnt(0) +; GFX9-FLATSCR-PAL-DAG: s_and_b32 s1, s1, 0xffff +; GFX9-FLATSCR-PAL-DAG: s_add_u32 flat_scratch_lo, s0, s5 +; GFX9-FLATSCR-PAL-DAG: s_addc_u32 flat_scratch_hi, s1, 0 + +; GFX10-FLATSCR-PAL: s_getpc_b64 s[0:1] +; GFX10-FLATSCR-PAL: s_mov_b32 s0, s8 +; GFX10-FLATSCR-PAL: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX10-FLATSCR-PAL: s_waitcnt lgkmcnt(0) +; GFX10-FLATSCR-PAL: s_and_b32 s1, s1, 0xffff +; GFX10-FLATSCR-PAL: s_add_u32 s0, s0, s5 +; GFX10-FLATSCR-PAL: s_addc_u32 s1, s1, 0 +; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 + ; MUBUF: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; FLATSCR-NOT: SCRATCH_RSRC_DWORD