diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -172,6 +172,18 @@ "VI SGPR initialization bug requiring a fixed SGPR allocation size" >; +def FeatureNegativeScratchOffsetBug : SubtargetFeature<"negative-scratch-offset-bug", + "NegativeScratchOffsetBug", + "true", + "Negative immediate offsets in scratch instructions with an SGPR offset page fault on GFX9" +>; + +def FeatureNegativeUnalignedScratchOffsetBug : SubtargetFeature<"negative-unaligned-scratch-offset-bug", + "NegativeUnalignedScratchOffsetBug", + "true", + "Scratch instructions with a VGPR offset and a negative immediate offset that is not a multiple of 4 read wrong memory on GFX10" +>; + def FeatureLdsMisalignedBug : SubtargetFeature<"lds-misaligned-bug", "LDSMisalignedBug", "true", @@ -771,7 +783,8 @@ FeatureAddNoCarryInsts, FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts, FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK, - FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess + FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, + FeatureNegativeScratchOffsetBug ] >; @@ -988,7 +1001,8 @@ FeatureLdsBranchVmemWARHazard, FeatureNSAtoVMEMBug, FeatureOffset3fBug, - FeatureFlatSegmentOffsetBug + FeatureFlatSegmentOffsetBug, + FeatureNegativeUnalignedScratchOffsetBug ]; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1669,7 +1669,7 @@ if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) { SDValue N0, N1; if (isBaseWithConstantOffset64(Addr, N0, N1)) { - uint64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue(); + int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue(); const SIInstrInfo *TII = Subtarget->getInstrInfo(); if
(TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) { @@ -1911,17 +1911,11 @@ if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch)) { - const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(*Subtarget, true); - // Use signed division by a power of two to truncate towards 0. - int64_t D = 1LL << (NumBits - 1); - int64_t RemainderOffset = (COffsetVal / D) * D; - int64_t ImmField = COffsetVal - RemainderOffset; + int64_t SplitImmOffset, RemainderOffset; + std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset( + COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch); - assert(TII->isLegalFLATOffset(ImmField, AMDGPUAS::PRIVATE_ADDRESS, - SIInstrFlags::FlatScratch)); - assert(RemainderOffset + ImmField == COffsetVal); - - COffsetVal = ImmField; + COffsetVal = SplitImmOffset; SDLoc DL(N); SDValue AddOffset = diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -195,7 +195,8 @@ { } GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, - const GCNTargetMachine &TM) : + const GCNTargetMachine &TM) + : // clang-format off AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS), AMDGPUSubtarget(TT), TargetTriple(TT), @@ -238,6 +239,8 @@ GFX10_3Insts(false), GFX7GFX8GFX9Insts(false), SGPRInitBug(false), + NegativeScratchOffsetBug(false), + NegativeUnalignedScratchOffsetBug(false), HasSMemRealTime(false), HasIntClamp(false), HasFmaMixInsts(false), @@ -312,6 +315,7 @@ InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)), TLInfo(TM, *this), FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) { + // clang-format on MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this); CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering())); InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering())); diff --git
a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -113,6 +113,8 @@ bool GFX10_3Insts; bool GFX7GFX8GFX9Insts; bool SGPRInitBug; + bool NegativeScratchOffsetBug; + bool NegativeUnalignedScratchOffsetBug; bool HasSMemRealTime; bool HasIntClamp; bool HasFmaMixInsts; @@ -890,6 +892,12 @@ return SGPRInitBug; } + bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; } + + bool hasNegativeUnalignedScratchOffsetBug() const { + return NegativeUnalignedScratchOffsetBug; + } + bool hasMFMAInlineLiteralBug() const { return HasMFMAInlineLiteralBug; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -7368,6 +7368,36 @@ return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass); } +// Depending on the used address space and instructions, some immediate offsets +// are allowed and some are not. +// In general, flat instruction offsets must be unsigned, while global and +// scratch instruction offsets can also be negative. +// +// There are several bugs related to these offsets: +// On gfx10.1, flat instructions that go into the global address space cannot +// use an offset. +// +// For scratch instructions, the address can be either an SGPR or a VGPR.
+// The following offsets can be used, depending on the architecture (x means +// cannot be used): +// +----------------------------+------+------+ +// | Address-Mode | SGPR | VGPR | +// +----------------------------+------+------+ +// | gfx9 | | | +// | negative, 4-aligned offset | x | ok | +// | negative, unaligned offset | x | ok | +// +----------------------------+------+------+ +// | gfx10 | | | +// | negative, 4-aligned offset | ok | ok | +// | negative, unaligned offset | ok | x | +// +----------------------------+------+------+ +// | gfx10.3 | | | +// | negative, 4-aligned offset | ok | ok | +// | negative, unaligned offset | ok | ok | +// +----------------------------+------+------+ +// +// This function ignores the addressing mode, so if an offset cannot be used in +// one addressing mode, it is considered illegal. bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const { // TODO: Should 0 be special cased? @@ -7380,22 +7410,44 @@ return false; bool Signed = FlatVariant != SIInstrFlags::FLAT; + if (ST.hasNegativeScratchOffsetBug() && + FlatVariant == SIInstrFlags::FlatScratch) + Signed = false; + if (ST.hasNegativeUnalignedScratchOffsetBug() && + FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 && + (Offset % 4) != 0) { + return false; + } + unsigned N = AMDGPU::getNumFlatOffsetBits(ST, Signed); return Signed ? isIntN(N, Offset) : isUIntN(N, Offset); } +// See comment on SIInstrInfo::isLegalFLATOffset for which offsets are legal. std::pair<int64_t, int64_t> SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const { int64_t RemainderOffset = COffsetVal; int64_t ImmField = 0; bool Signed = FlatVariant != SIInstrFlags::FLAT; + if (ST.hasNegativeScratchOffsetBug() && + FlatVariant == SIInstrFlags::FlatScratch) + Signed = false; + const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST, Signed); if (Signed) { // Use signed division by a power of two to truncate towards 0.
int64_t D = 1LL << (NumBits - 1); RemainderOffset = (COffsetVal / D) * D; ImmField = COffsetVal - RemainderOffset; + + if (ST.hasNegativeUnalignedScratchOffsetBug() && + FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 && + (ImmField % 4) != 0) { + // Make ImmField 4-aligned; the excess moves into RemainderOffset. + RemainderOffset += ImmField % 4; + ImmField -= ImmField % 4; + } } else if (COffsetVal >= 0) { ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits); RemainderOffset = COffsetVal - ImmField; diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -2,7 +2,8 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s ; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9-PAL %s -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10-PAL %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1010-PAL %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1030-PAL %s define amdgpu_kernel void @zero_init_kernel() { ; GFX9-LABEL: zero_init_kernel: @@ -74,30 +75,62 @@ ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 ; GFX9-PAL-NEXT: s_endpgm ; -; GFX10-PAL-LABEL: zero_init_kernel: -; GFX10-PAL: ; %bb.0: -; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] -;
GFX10-PAL-NEXT: s_mov_b32 s2, s0 -; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff -; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1 -; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 -; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 -; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; GFX10-PAL-NEXT: s_mov_b32 s0, 0 -; GFX10-PAL-NEXT: s_mov_b32 s1, s0 -; GFX10-PAL-NEXT: s_mov_b32 s2, s0 -; GFX10-PAL-NEXT: s_mov_b32 s3, s0 -; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:64 -; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:48 -; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:32 -; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:16 -; GFX10-PAL-NEXT: s_endpgm +; GFX1010-PAL-LABEL: zero_init_kernel: +; GFX1010-PAL: ; %bb.0: +; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 +; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 +; GFX1010-PAL-NEXT: s_mov_b32 s1, s0 +; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1010-PAL-NEXT: s_mov_b32 s3, s0 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, s2 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3 +; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:64 +; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 +; 
GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 +; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 +; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 +; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 +; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 +; GFX1010-PAL-NEXT: s_endpgm +; +; GFX1030-PAL-LABEL: zero_init_kernel: +; GFX1030-PAL: ; %bb.0: +; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 +; GFX1030-PAL-NEXT: s_mov_b32 s1, s0 +; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1030-PAL-NEXT: s_mov_b32 s3, s0 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, s3 +; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:64 +; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:48 +; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:32 +; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:16 +; GFX1030-PAL-NEXT: s_endpgm %alloca = alloca [32 x i16], align 2, addrspace(5) %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) @@ -650,32 +683,67 @@ ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:320 ; GFX9-PAL-NEXT: s_endpgm ; -; GFX10-PAL-LABEL: zero_init_small_offset_kernel: -; GFX10-PAL: ; %bb.0: -; GFX10-PAL-NEXT: 
s_getpc_b64 s[2:3] -; GFX10-PAL-NEXT: s_mov_b32 s2, s0 -; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff -; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1 -; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 -; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 -; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc -; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX10-PAL-NEXT: s_mov_b32 s0, 0 -; GFX10-PAL-NEXT: s_mov_b32 s1, s0 -; GFX10-PAL-NEXT: s_mov_b32 s2, s0 -; GFX10-PAL-NEXT: s_mov_b32 s3, s0 -; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:272 -; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:288 -; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:304 -; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:320 -; GFX10-PAL-NEXT: s_endpgm +; GFX1010-PAL-LABEL: zero_init_small_offset_kernel: +; GFX1010-PAL: ; %bb.0: +; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 +; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 +; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 glc dlc +; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-PAL-NEXT: s_mov_b32 s1, s0 +; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1010-PAL-NEXT: s_mov_b32 s3, s0 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1010-PAL-NEXT: 
v_mov_b32_e32 v1, s1 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, s2 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3 +; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 +; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:272 +; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 +; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:288 +; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 +; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:304 +; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 +; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:320 +; GFX1010-PAL-NEXT: s_endpgm +; +; GFX1030-PAL-LABEL: zero_init_small_offset_kernel: +; GFX1030-PAL: ; %bb.0: +; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc +; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 +; GFX1030-PAL-NEXT: s_mov_b32 s1, s0 +; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1030-PAL-NEXT: s_mov_b32 s3, s0 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, s3 +; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:272 +; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:288 +; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:304 +; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:320 +; GFX1030-PAL-NEXT: s_endpgm %padding = 
alloca [64 x i32], align 4, addrspace(5) %alloca = alloca [32 x i16], align 2, addrspace(5) %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef @@ -844,32 +912,60 @@ ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm ; -; GFX10-PAL-LABEL: store_load_sindex_small_offset_kernel: -; GFX10-PAL: ; %bb.0: ; %bb -; GFX10-PAL-NEXT: s_getpc_b64 s[4:5] -; GFX10-PAL-NEXT: s_mov_b32 s4, s0 -; GFX10-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PAL-NEXT: s_and_b32 s5, s5, 0xffff -; GFX10-PAL-NEXT: s_add_u32 s4, s4, s3 -; GFX10-PAL-NEXT: s_addc_u32 s5, s5, 0 -; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 -; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 -; GFX10-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc -; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15 -; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15 -; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2 -; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2 -; GFX10-PAL-NEXT: s_add_u32 s0, 0x104, s0 -; GFX10-PAL-NEXT: s_add_u32 s1, 0x104, s1 -; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0 -; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc -; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX10-PAL-NEXT: s_endpgm +; GFX1010-PAL-LABEL: store_load_sindex_small_offset_kernel: +; GFX1010-PAL: ; %bb.0: ; %bb +; GFX1010-PAL-NEXT: s_getpc_b64 s[4:5] +; GFX1010-PAL-NEXT: s_mov_b32 s4, s0 +; GFX1010-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-PAL-NEXT: s_and_b32 s5, s5, 0xffff +; GFX1010-PAL-NEXT: s_add_u32 s4, s4, s3 +; GFX1010-PAL-NEXT: s_addc_u32 s5, s5, 0 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 +; GFX1010-PAL-NEXT: 
s_load_dword s0, s[0:1], 0x24 +; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 +; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 glc dlc +; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15 +; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-PAL-NEXT: s_and_b32 s1, s0, 15 +; GFX1010-PAL-NEXT: s_lshl_b32 s0, s0, 2 +; GFX1010-PAL-NEXT: s_lshl_b32 s1, s1, 2 +; GFX1010-PAL-NEXT: s_add_u32 s0, 0x104, s0 +; GFX1010-PAL-NEXT: s_add_u32 s1, 0x104, s1 +; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s0 +; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc +; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-PAL-NEXT: s_endpgm +; +; GFX1030-PAL-LABEL: store_load_sindex_small_offset_kernel: +; GFX1030-PAL: ; %bb.0: ; %bb +; GFX1030-PAL-NEXT: s_getpc_b64 s[4:5] +; GFX1030-PAL-NEXT: s_mov_b32 s4, s0 +; GFX1030-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1030-PAL-NEXT: s_and_b32 s5, s5, 0xffff +; GFX1030-PAL-NEXT: s_add_u32 s4, s4, s3 +; GFX1030-PAL-NEXT: s_addc_u32 s5, s5, 0 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 +; GFX1030-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc +; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15 +; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15 +; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2 +; GFX1030-PAL-NEXT: s_lshl_b32 s1, s1, 2 +; GFX1030-PAL-NEXT: s_add_u32 s0, 0x104, s0 +; GFX1030-PAL-NEXT: s_add_u32 s1, 0x104, s1 +; GFX1030-PAL-NEXT: scratch_store_dword off, v0, s0 +; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc +; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1030-PAL-NEXT: s_endpgm bb: %padding = alloca [64 x i32], align 4, addrspace(5) %i = 
alloca [32 x float], align 4, addrspace(5) @@ -950,30 +1046,56 @@ ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm ; -; GFX10-PAL-LABEL: store_load_sindex_small_offset_foo: -; GFX10-PAL: ; %bb.0: ; %bb -; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] -; GFX10-PAL-NEXT: s_mov_b32 s2, s0 -; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff -; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1 -; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 -; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 -; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc -; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15 -; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15 -; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2 -; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2 -; GFX10-PAL-NEXT: s_add_u32 s0, 0x104, s0 -; GFX10-PAL-NEXT: s_add_u32 s1, 0x104, s1 -; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0 -; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc -; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX10-PAL-NEXT: s_endpgm +; GFX1010-PAL-LABEL: store_load_sindex_small_offset_foo: +; GFX1010-PAL: ; %bb.0: ; %bb +; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 +; GFX1010-PAL-NEXT: s_and_b32 s1, s0, 15 +; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 glc dlc +; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15 +; GFX1010-PAL-NEXT: s_lshl_b32 s0, 
s0, 2 +; GFX1010-PAL-NEXT: s_lshl_b32 s1, s1, 2 +; GFX1010-PAL-NEXT: s_add_u32 s0, 0x104, s0 +; GFX1010-PAL-NEXT: s_add_u32 s1, 0x104, s1 +; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s0 +; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc +; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-PAL-NEXT: s_endpgm +; +; GFX1030-PAL-LABEL: store_load_sindex_small_offset_foo: +; GFX1030-PAL: ; %bb.0: ; %bb +; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc +; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15 +; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2 +; GFX1030-PAL-NEXT: s_lshl_b32 s1, s1, 2 +; GFX1030-PAL-NEXT: s_add_u32 s0, 0x104, s0 +; GFX1030-PAL-NEXT: s_add_u32 s1, 0x104, s1 +; GFX1030-PAL-NEXT: scratch_store_dword off, v0, s0 +; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc +; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1030-PAL-NEXT: s_endpgm bb: %padding = alloca [64 x i32], align 4, addrspace(5) %i = alloca [32 x float], align 4, addrspace(5) @@ -1051,29 +1173,54 @@ ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm ; -; GFX10-PAL-LABEL: store_load_vindex_small_offset_kernel: -; GFX10-PAL: ; %bb.0: ; %bb -; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] -; GFX10-PAL-NEXT: s_mov_b32 s2, s0 -; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff -; 
GFX10-PAL-NEXT: s_add_u32 s2, s2, s1 -; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 -; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 -; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 0x104 -; GFX10-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0 -; GFX10-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX10-PAL-NEXT: scratch_load_dword v1, off, off offset:4 glc dlc -; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX10-PAL-NEXT: scratch_store_dword v2, v3, off -; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc -; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX10-PAL-NEXT: s_endpgm +; GFX1010-PAL-LABEL: store_load_vindex_small_offset_kernel: +; GFX1010-PAL: ; %bb.0: ; %bb +; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 0x104 +; GFX1010-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, 15 +; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 +; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0 +; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX1010-PAL-NEXT: scratch_load_dword v1, off, vcc_lo offset:4 glc dlc +; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-PAL-NEXT: scratch_store_dword v2, v3, off +; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1010-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc +; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-PAL-NEXT: s_endpgm +; +; GFX1030-PAL-LABEL: store_load_vindex_small_offset_kernel: +; GFX1030-PAL: ; 
%bb.0: ; %bb +; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 0x104 +; GFX1030-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, 15 +; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0 +; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX1030-PAL-NEXT: scratch_load_dword v1, off, off offset:4 glc dlc +; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1030-PAL-NEXT: scratch_store_dword v2, v3, off +; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1030-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc +; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1030-PAL-NEXT: s_endpgm bb: %padding = alloca [64 x i32], align 4, addrspace(5) %i = alloca [32 x float], align 4, addrspace(5) @@ -1260,36 +1407,71 @@ ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 ; GFX9-PAL-NEXT: s_endpgm ; -; GFX10-PAL-LABEL: zero_init_large_offset_kernel: -; GFX10-PAL: ; %bb.0: -; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] -; GFX10-PAL-NEXT: s_mov_b32 s2, s0 -; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff -; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1 -; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 -; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 -; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:16 glc dlc -; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX10-PAL-NEXT: s_mov_b32 s0, 0 -; GFX10-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 -; GFX10-PAL-NEXT: s_mov_b32 s1, s0 -; GFX10-PAL-NEXT: s_mov_b32 
s2, s0 -; GFX10-PAL-NEXT: s_mov_b32 s3, s0 -; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo -; GFX10-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 -; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 -; GFX10-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 -; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 -; GFX10-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 -; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 -; GFX10-PAL-NEXT: s_endpgm +; GFX1010-PAL-LABEL: zero_init_large_offset_kernel: +; GFX1010-PAL: ; %bb.0: +; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 +; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 +; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:16 glc dlc +; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-PAL-NEXT: s_mov_b32 s1, s0 +; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1010-PAL-NEXT: s_mov_b32 s3, s0 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, s2 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3 +; GFX1010-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo +; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1010-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 +; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1010-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 +; 
GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 +; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1010-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 +; GFX1010-PAL-NEXT: s_endpgm +; +; GFX1030-PAL-LABEL: zero_init_large_offset_kernel: +; GFX1030-PAL: ; %bb.0: +; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:16 glc dlc +; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 +; GFX1030-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX1030-PAL-NEXT: s_mov_b32 s1, s0 +; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1030-PAL-NEXT: s_mov_b32 s3, s0 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, s3 +; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo +; GFX1030-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 +; GFX1030-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 +; GFX1030-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 +; GFX1030-PAL-NEXT: s_endpgm %padding = alloca [4096 x i32], align 4, addrspace(5) %alloca = alloca [32 x i16], align 2, addrspace(5) %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef @@ -1373,30 +1555,58 @@ ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX10-PAL-LABEL: zero_init_large_offset_foo: -; GFX10-PAL: ; %bb.0: -; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-PAL-NEXT: scratch_load_dword v0, off, s32 glc dlc -; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX10-PAL-NEXT: s_mov_b32 s0, 0 -; GFX10-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000 -; GFX10-PAL-NEXT: s_mov_b32 s1, s0 -; GFX10-PAL-NEXT: s_mov_b32 s2, s0 -; GFX10-PAL-NEXT: s_mov_b32 s3, s0 -; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo -; GFX10-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000 -; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 -; GFX10-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000 -; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 -; GFX10-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000 -; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 -; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] +; GFX1010-PAL-LABEL: zero_init_large_offset_foo: +; GFX1010-PAL: ; %bb.0: +; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s32 glc dlc +; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 +; GFX1010-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX1010-PAL-NEXT: s_mov_b32 s1, s0 +; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1010-PAL-NEXT: s_mov_b32 s3, s0 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, s2 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3 +; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo +; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1010-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX1010-PAL-NEXT: 
scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 +; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1010-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 +; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1010-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 +; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1010-PAL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1030-PAL-LABEL: zero_init_large_offset_foo: +; GFX1030-PAL: ; %bb.0: +; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s32 glc dlc +; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 +; GFX1030-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX1030-PAL-NEXT: s_mov_b32 s1, s0 +; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1030-PAL-NEXT: s_mov_b32 s3, s0 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, s3 +; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo +; GFX1030-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 +; GFX1030-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 +; GFX1030-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 +; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1030-PAL-NEXT: s_setpc_b64 s[30:31] %padding = alloca [4096 x i32], align 4, addrspace(5) %alloca = alloca [32 x i16], align 2, addrspace(5) %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef @@ -1474,32 +1684,60 @@ ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm ; -; GFX10-PAL-LABEL: 
store_load_sindex_large_offset_kernel: -; GFX10-PAL: ; %bb.0: ; %bb -; GFX10-PAL-NEXT: s_getpc_b64 s[4:5] -; GFX10-PAL-NEXT: s_mov_b32 s4, s0 -; GFX10-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PAL-NEXT: s_and_b32 s5, s5, 0xffff -; GFX10-PAL-NEXT: s_add_u32 s4, s4, s3 -; GFX10-PAL-NEXT: s_addc_u32 s5, s5, 0 -; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 -; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 -; GFX10-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc -; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15 -; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15 -; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2 -; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2 -; GFX10-PAL-NEXT: s_add_u32 s0, 0x4004, s0 -; GFX10-PAL-NEXT: s_add_u32 s1, 0x4004, s1 -; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0 -; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc -; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX10-PAL-NEXT: s_endpgm +; GFX1010-PAL-LABEL: store_load_sindex_large_offset_kernel: +; GFX1010-PAL: ; %bb.0: ; %bb +; GFX1010-PAL-NEXT: s_getpc_b64 s[4:5] +; GFX1010-PAL-NEXT: s_mov_b32 s4, s0 +; GFX1010-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-PAL-NEXT: s_and_b32 s5, s5, 0xffff +; GFX1010-PAL-NEXT: s_add_u32 s4, s4, s3 +; GFX1010-PAL-NEXT: s_addc_u32 s5, s5, 0 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 +; GFX1010-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 +; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 glc dlc +; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15 +; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-PAL-NEXT: s_and_b32 s1, 
s0, 15 +; GFX1010-PAL-NEXT: s_lshl_b32 s0, s0, 2 +; GFX1010-PAL-NEXT: s_lshl_b32 s1, s1, 2 +; GFX1010-PAL-NEXT: s_add_u32 s0, 0x4004, s0 +; GFX1010-PAL-NEXT: s_add_u32 s1, 0x4004, s1 +; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s0 +; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc +; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-PAL-NEXT: s_endpgm +; +; GFX1030-PAL-LABEL: store_load_sindex_large_offset_kernel: +; GFX1030-PAL: ; %bb.0: ; %bb +; GFX1030-PAL-NEXT: s_getpc_b64 s[4:5] +; GFX1030-PAL-NEXT: s_mov_b32 s4, s0 +; GFX1030-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1030-PAL-NEXT: s_and_b32 s5, s5, 0xffff +; GFX1030-PAL-NEXT: s_add_u32 s4, s4, s3 +; GFX1030-PAL-NEXT: s_addc_u32 s5, s5, 0 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 +; GFX1030-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc +; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15 +; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15 +; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2 +; GFX1030-PAL-NEXT: s_lshl_b32 s1, s1, 2 +; GFX1030-PAL-NEXT: s_add_u32 s0, 0x4004, s0 +; GFX1030-PAL-NEXT: s_add_u32 s1, 0x4004, s1 +; GFX1030-PAL-NEXT: scratch_store_dword off, v0, s0 +; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc +; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1030-PAL-NEXT: s_endpgm bb: %padding = alloca [4096 x i32], align 4, addrspace(5) %i = alloca [32 x float], align 4, addrspace(5) @@ -1580,30 +1818,56 @@ ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm ; -; GFX10-PAL-LABEL: store_load_sindex_large_offset_foo: -; GFX10-PAL: ; %bb.0: ; %bb -; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] -; GFX10-PAL-NEXT: s_mov_b32 s2, s0 -; 
GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff -; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1 -; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 -; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 -; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc -; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15 -; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15 -; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2 -; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2 -; GFX10-PAL-NEXT: s_add_u32 s0, 0x4004, s0 -; GFX10-PAL-NEXT: s_add_u32 s1, 0x4004, s1 -; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0 -; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc -; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX10-PAL-NEXT: s_endpgm +; GFX1010-PAL-LABEL: store_load_sindex_large_offset_foo: +; GFX1010-PAL: ; %bb.0: ; %bb +; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 +; GFX1010-PAL-NEXT: s_and_b32 s1, s0, 15 +; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 glc dlc +; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15 +; GFX1010-PAL-NEXT: s_lshl_b32 s0, s0, 2 +; GFX1010-PAL-NEXT: s_lshl_b32 s1, s1, 2 +; GFX1010-PAL-NEXT: s_add_u32 s0, 0x4004, s0 +; GFX1010-PAL-NEXT: s_add_u32 s1, 0x4004, s1 +; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s0 +; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1010-PAL-NEXT: scratch_load_dword v0, off, 
s1 glc dlc +; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-PAL-NEXT: s_endpgm +; +; GFX1030-PAL-LABEL: store_load_sindex_large_offset_foo: +; GFX1030-PAL: ; %bb.0: ; %bb +; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc +; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15 +; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2 +; GFX1030-PAL-NEXT: s_lshl_b32 s1, s1, 2 +; GFX1030-PAL-NEXT: s_add_u32 s0, 0x4004, s0 +; GFX1030-PAL-NEXT: s_add_u32 s1, 0x4004, s1 +; GFX1030-PAL-NEXT: scratch_store_dword off, v0, s0 +; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc +; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1030-PAL-NEXT: s_endpgm bb: %padding = alloca [4096 x i32], align 4, addrspace(5) %i = alloca [32 x float], align 4, addrspace(5) @@ -1681,29 +1945,54 @@ ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm ; -; GFX10-PAL-LABEL: store_load_vindex_large_offset_kernel: -; GFX10-PAL: ; %bb.0: ; %bb -; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] -; GFX10-PAL-NEXT: s_mov_b32 s2, s0 -; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff -; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1 -; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 -; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 -; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 0x4004 -; GFX10-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; 
GFX10-PAL-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0 -; GFX10-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX10-PAL-NEXT: scratch_load_dword v1, off, off offset:4 glc dlc -; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX10-PAL-NEXT: scratch_store_dword v2, v3, off -; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc -; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX10-PAL-NEXT: s_endpgm +; GFX1010-PAL-LABEL: store_load_vindex_large_offset_kernel: +; GFX1010-PAL: ; %bb.0: ; %bb +; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 0x4004 +; GFX1010-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, 15 +; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 +; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0 +; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX1010-PAL-NEXT: scratch_load_dword v1, off, vcc_lo offset:4 glc dlc +; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-PAL-NEXT: scratch_store_dword v2, v3, off +; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1010-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc +; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-PAL-NEXT: s_endpgm +; +; GFX1030-PAL-LABEL: store_load_vindex_large_offset_kernel: +; GFX1030-PAL: ; %bb.0: ; %bb +; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 +; 
GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 0x4004 +; GFX1030-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, 15 +; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0 +; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX1030-PAL-NEXT: scratch_load_dword v1, off, off offset:4 glc dlc +; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1030-PAL-NEXT: scratch_store_dword v2, v3, off +; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1030-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc +; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1030-PAL-NEXT: s_endpgm bb: %padding = alloca [4096 x i32], align 4, addrspace(5) %i = alloca [32 x float], align 4, addrspace(5) @@ -1866,28 +2155,52 @@ ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm ; -; GFX10-PAL-LABEL: store_load_large_imm_offset_kernel: -; GFX10-PAL: ; %bb.0: ; %bb -; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] -; GFX10-PAL-NEXT: s_mov_b32 s2, s0 -; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff -; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1 -; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 -; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 -; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 13 -; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 -; GFX10-PAL-NEXT: s_movk_i32 s0, 0x3800 -; GFX10-PAL-NEXT: s_add_u32 s0, 4, s0 -; GFX10-PAL-NEXT: scratch_store_dword off, v0, off offset:4 -; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664 -; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc -; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX10-PAL-NEXT: s_endpgm +; GFX1010-PAL-LABEL: 
store_load_large_imm_offset_kernel: +; GFX1010-PAL: ; %bb.0: ; %bb +; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 13 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 15 +; GFX1010-PAL-NEXT: s_movk_i32 s0, 0x3800 +; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 +; GFX1010-PAL-NEXT: s_add_u32 s0, 4, s0 +; GFX1010-PAL-NEXT: scratch_store_dword off, v0, vcc_lo offset:4 +; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1010-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664 +; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc +; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-PAL-NEXT: s_endpgm +; +; GFX1030-PAL-LABEL: store_load_large_imm_offset_kernel: +; GFX1030-PAL: ; %bb.0: ; %bb +; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 13 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 15 +; GFX1030-PAL-NEXT: s_movk_i32 s0, 0x3800 +; GFX1030-PAL-NEXT: s_add_u32 s0, 4, s0 +; GFX1030-PAL-NEXT: scratch_store_dword off, v0, off offset:4 +; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1030-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664 +; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; 
GFX1030-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc +; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1030-PAL-NEXT: s_endpgm bb: %i = alloca [4096 x i32], align 4, addrspace(5) %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef @@ -2278,5 +2591,133 @@ ret void } +define void @store_load_i32_negative_unaligned(i8 addrspace(5)* nocapture %arg) { +; GFX9-LABEL: store_load_i32_negative_unaligned: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v0, -1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 1 +; GFX9-NEXT: scratch_store_byte v0, v1, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: scratch_load_ubyte v0, v0, off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: store_load_i32_negative_unaligned: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v1, 1 +; GFX10-NEXT: scratch_store_byte v0, v1, off offset:-1 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: scratch_load_ubyte v0, v0, off offset:-1 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-PAL-LABEL: store_load_i32_negative_unaligned: +; GFX9-PAL: ; %bb.0: ; %bb +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-PAL-NEXT: v_add_u32_e32 v0, -1, v0 +; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 1 +; GFX9-PAL-NEXT: scratch_store_byte v0, v1, off +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: scratch_load_ubyte v0, v0, off glc +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-PAL-LABEL: store_load_i32_negative_unaligned: +; GFX1010-PAL: ; %bb.0: ; %bb +; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v0, -1, v0 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 1 +; GFX1010-PAL-NEXT: 
scratch_store_byte v0, v1, off +; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1010-PAL-NEXT: scratch_load_ubyte v0, v0, off glc dlc +; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-PAL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1030-PAL-LABEL: store_load_i32_negative_unaligned: +; GFX1030-PAL: ; %bb.0: ; %bb +; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 1 +; GFX1030-PAL-NEXT: scratch_store_byte v0, v1, off offset:-1 +; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1030-PAL-NEXT: scratch_load_ubyte v0, v0, off offset:-1 glc dlc +; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1030-PAL-NEXT: s_setpc_b64 s[30:31] +bb: + %ptr = getelementptr inbounds i8, i8 addrspace(5)* %arg, i32 -1 + store volatile i8 1, i8 addrspace(5)* %ptr, align 1 + %load = load volatile i8, i8 addrspace(5)* %ptr, align 1 + ret void +} + +define void @store_load_i32_large_negative_unaligned(i8 addrspace(5)* nocapture %arg) { +; GFX9-LABEL: store_load_i32_large_negative_unaligned: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v0, 0xffffef7f, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 1 +; GFX9-NEXT: scratch_store_byte v0, v1, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: scratch_load_ubyte v0, v0, off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: store_load_i32_large_negative_unaligned: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 0xfffff000, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 1 +; GFX10-NEXT: scratch_store_byte v0, v1, off offset:-129 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: scratch_load_ubyte v0, v0, off offset:-129 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-PAL-LABEL: store_load_i32_large_negative_unaligned: +; GFX9-PAL: ; 
%bb.0: ; %bb +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-PAL-NEXT: v_add_u32_e32 v0, 0xffffef7f, v0 +; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 1 +; GFX9-PAL-NEXT: scratch_store_byte v0, v1, off +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: scratch_load_ubyte v0, v0, off glc +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-PAL-LABEL: store_load_i32_large_negative_unaligned: +; GFX1010-PAL: ; %bb.0: ; %bb +; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v0, 0xffffefff, v0 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 1 +; GFX1010-PAL-NEXT: scratch_store_byte v0, v1, off offset:-128 +; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1010-PAL-NEXT: scratch_load_ubyte v0, v0, off offset:-128 glc dlc +; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-PAL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1030-PAL-LABEL: store_load_i32_large_negative_unaligned: +; GFX1030-PAL: ; %bb.0: ; %bb +; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v0, 0xfffff000, v0 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 1 +; GFX1030-PAL-NEXT: scratch_store_byte v0, v1, off offset:-129 +; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1030-PAL-NEXT: scratch_load_ubyte v0, v0, off offset:-129 glc dlc +; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1030-PAL-NEXT: s_setpc_b64 s[30:31] +bb: + %ptr = getelementptr inbounds i8, i8 addrspace(5)* %arg, i32 -4225 + store volatile i8 1, i8 addrspace(5)* %ptr, align 1 + %load = load volatile i8, i8 addrspace(5)* %ptr, align 1 + ret void +} + declare void @llvm.memset.p5i8.i64(i8 addrspace(5)* nocapture writeonly, i8, i64, i1 immarg) declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll --- 
a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll @@ -279,30 +279,31 @@ ; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5 ; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 -; FLATSCR-NEXT: s_add_u32 s2, 16, 0x4000 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 -; FLATSCR-NEXT: s_movk_i32 vcc_hi, 0x2000 -; FLATSCR-NEXT: s_mov_b32 s3, 0 -; FLATSCR-NEXT: scratch_store_dword off, v0, vcc_hi +; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 +; FLATSCR-NEXT: s_mov_b32 s2, 0 +; FLATSCR-NEXT: scratch_store_dword off, v0, vcc_hi offset:1024 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: BB2_1: ; %loadstoreloop ; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1 -; FLATSCR-NEXT: s_add_u32 s4, 0x4000, s3 -; FLATSCR-NEXT: s_add_i32 s3, s3, 1 -; FLATSCR-NEXT: s_cmpk_lt_u32 s3, 0x2120 -; FLATSCR-NEXT: scratch_store_byte off, v0, s4 +; FLATSCR-NEXT: s_add_u32 s3, 0x2000, s2 +; FLATSCR-NEXT: s_add_i32 s2, s2, 1 +; FLATSCR-NEXT: s_cmpk_lt_u32 s2, 0x2120 +; FLATSCR-NEXT: scratch_store_byte off, v0, s3 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_cbranch_scc1 BB2_1 ; FLATSCR-NEXT: ; %bb.2: ; %split -; FLATSCR-NEXT: s_movk_i32 s3, 0x1000 -; FLATSCR-NEXT: s_add_u32 s3, 0x4000, s3 -; FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s3 offset:720 glc +; FLATSCR-NEXT: s_movk_i32 s2, 0x1000 +; FLATSCR-NEXT: s_add_u32 s2, 0x2000, s2 +; FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s2 offset:720 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s3 offset:704 glc +; FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 offset:704 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: scratch_load_dwordx2 v[10:11], off, s2 glc +; FLATSCR-NEXT: s_movk_i32 s2, 0x2000 +; FLATSCR-NEXT: scratch_load_dwordx2 v[10:11], off, s2 offset:16 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: scratch_load_dwordx4 v[4:7], off, s2 
offset:-16 glc +; FLATSCR-NEXT: s_movk_i32 s2, 0x2000 +; FLATSCR-NEXT: scratch_load_dwordx4 v[4:7], off, s2 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: v_mov_b32_e32 v12, 0 ; FLATSCR-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6