diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -158,6 +158,9 @@ bool isDSOffsetLegal(SDValue Base, unsigned Offset) const; bool isDSOffset2Legal(SDValue Base, unsigned Offset0, unsigned Offset1, unsigned Size) const; + bool isFlatScratchBaseLegal( + SDValue Base, uint64_t FlatVariant = SIInstrFlags::FlatScratch) const; + bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const; bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0, SDValue &Offset1) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1135,6 +1135,15 @@ return CurDAG->SignBitIsZero(Base); } +bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Base, + uint64_t FlatVariant) const { + if (FlatVariant != SIInstrFlags::FlatScratch) + return true; + // When value in 32-bit Base can be negative calculate scratch offset using + // 32-bit add instruction, otherwise use Base(unsigned) + offset. + return CurDAG->SignBitIsZero(Base); +} + // TODO: If offset is too big, put low 16-bit into offset. 
bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base, SDValue &Offset0, @@ -1760,7 +1769,8 @@ int64_t COffsetVal = 0; - if (CurDAG->isBaseWithConstantOffset(Addr)) { + if (CurDAG->isBaseWithConstantOffset(Addr) && + isFlatScratchBaseLegal(Addr.getOperand(0))) { COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue(); SAddr = Addr.getOperand(0); } else { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -13,6 +13,7 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRUCTIONSELECTOR_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRUCTIONSELECTOR_H +#include "SIDefines.h" #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" #include "llvm/IR/InstrTypes.h" @@ -236,6 +237,8 @@ bool isDSOffsetLegal(Register Base, int64_t Offset) const; bool isDSOffset2Legal(Register Base, int64_t Offset0, int64_t Offset1, unsigned Size) const; + bool isFlatScratchBaseLegal( + Register Base, uint64_t FlatVariant = SIInstrFlags::FlatScratch) const; std::pair<Register, unsigned> selectDS1Addr1OffsetImpl(MachineOperand &Root) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -4010,7 +4010,7 @@ // possible. 
std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI); - if (ConstOffset != 0 && + if (ConstOffset != 0 && isFlatScratchBaseLegal(PtrBase) && TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch)) { Addr = PtrBase; @@ -4234,6 +4234,16 @@ return KnownBits->signBitIsZero(Base); } +bool AMDGPUInstructionSelector::isFlatScratchBaseLegal( + Register Base, uint64_t FlatVariant) const { + if (FlatVariant != SIInstrFlags::FlatScratch) + return true; + + // When value in 32-bit Base can be negative calculate scratch offset using + // 32-bit add instruction, otherwise use Base(unsigned) + offset. + return KnownBits->signBitIsZero(Base); +} + bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI, unsigned ShAmtBits) const { assert(MI.getOpcode() == TargetOpcode::G_AND); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll @@ -33,20 +33,24 @@ ; ; FLATSCR-LABEL: kernel_caller_stack: ; FLATSCR: ; %bb.0: -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3 ; FLATSCR-NEXT: s_mov_b32 s32, 0 +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3 ; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; FLATSCR-NEXT: s_add_u32 s0, s32, 4 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 9 -; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:4 +; FLATSCR-NEXT: scratch_store_dword off, v0, s0 +; FLATSCR-NEXT: s_add_u32 s0, s32, 8 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 10 -; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:8 +; FLATSCR-NEXT: scratch_store_dword off, v0, s0 +; FLATSCR-NEXT: s_add_u32 s0, s32, 12 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 11 -; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:12 +; FLATSCR-NEXT: scratch_store_dword off, v0, s0 +; FLATSCR-NEXT: s_add_u32 s2, s32, 16 ; FLATSCR-NEXT: 
v_mov_b32_e32 v0, 12 ; FLATSCR-NEXT: s_getpc_b64 s[0:1] ; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4 ; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12 -; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:16 +; FLATSCR-NEXT: scratch_store_dword off, v0, s2 ; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] ; FLATSCR-NEXT: s_endpgm call void @external_void_func_v16i32_v16i32_v4i32(<16 x i32> undef, <16 x i32> undef, <4 x i32> <i32 9, i32 10, i32 11, i32 12>) @@ -155,42 +159,40 @@ ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 ; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; FLATSCR-NEXT: v_mov_b32_e32 v1, 0 +; FLATSCR-NEXT: s_mov_b32 s8, 0 +; FLATSCR-NEXT: s_mov_b32 s10, 0 +; FLATSCR-NEXT: s_mov_b32 s9, 0 ; FLATSCR-NEXT: s_mov_b32 vcc_lo, 0 ; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 ; FLATSCR-NEXT: s_mov_b32 s11, 0 -; FLATSCR-NEXT: s_mov_b32 s10, 0 -; FLATSCR-NEXT: s_mov_b32 s9, 0 -; FLATSCR-NEXT: s_mov_b32 s8, 0 ; FLATSCR-NEXT: s_mov_b32 s7, 0 -; FLATSCR-NEXT: s_mov_b32 s6, 0 ; FLATSCR-NEXT: s_mov_b32 s5, 0 +; FLATSCR-NEXT: s_mov_b32 s3, 0 ; FLATSCR-NEXT: s_mov_b32 s1, 0 ; FLATSCR-NEXT: s_mov_b32 s0, 0 -; FLATSCR-NEXT: s_mov_b32 s4, 0 -; FLATSCR-NEXT: s_mov_b32 s3, 0 ; FLATSCR-NEXT: s_mov_b32 s2, 0 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_lo offset:8 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_hi offset:16 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s11 offset:24 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s10 offset:32 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s9 offset:40 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s8 offset:48 +; FLATSCR-NEXT: s_mov_b32 s4, 0 +; FLATSCR-NEXT: s_mov_b32 s6, 0 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s8 offset:8 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s10 offset:16 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s9 offset:24 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_lo offset:32 +; FLATSCR-NEXT: 
scratch_store_dwordx2 off, v[0:1], vcc_hi offset:40 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s11 offset:48 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s7 offset:56 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s6 offset:64 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s5 offset:72 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s5 offset:64 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s3 offset:72 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s1 offset:80 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:88 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s4 offset:96 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s3 offset:104 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s2 offset:112 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_lo offset:120 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_hi offset:128 -; FLATSCR-NEXT: s_mov_b32 s40, 0 -; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s40 offset:8 -; FLATSCR-NEXT: s_mov_b32 s39, 0 -; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s39 offset:16 -; FLATSCR-NEXT: s_mov_b32 s38, 0 -; FLATSCR-NEXT: scratch_load_dwordx2 v[4:5], off, s38 offset:24 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s2 offset:96 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s4 offset:104 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s6 offset:112 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s8 offset:120 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s10 offset:128 +; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s9 offset:8 +; FLATSCR-NEXT: s_nop 0 +; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, vcc_lo offset:16 +; FLATSCR-NEXT: scratch_load_dwordx2 v[4:5], off, vcc_hi offset:24 ; FLATSCR-NEXT: s_mov_b32 s37, 0 ; FLATSCR-NEXT: scratch_load_dwordx2 v[6:7], off, s37 offset:32 ; FLATSCR-NEXT: s_mov_b32 s36, 0 @@ -205,22 +207,29 @@ ; FLATSCR-NEXT: s_getpc_b64 s[0:1] ; FLATSCR-NEXT: s_add_u32 s0, s0, 
external_void_func_byval@rel32@lo+4 ; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_byval@rel32@hi+12 +; FLATSCR-NEXT: s_add_u32 s2, s32, 8 +; FLATSCR-NEXT: s_add_u32 s3, s32, 16 +; FLATSCR-NEXT: s_add_u32 s4, s32, 24 +; FLATSCR-NEXT: s_add_u32 s5, s32, 32 +; FLATSCR-NEXT: s_add_u32 s6, s32, 40 +; FLATSCR-NEXT: s_add_u32 s7, s32, 48 +; FLATSCR-NEXT: s_add_u32 s8, s32, 56 ; FLATSCR-NEXT: s_waitcnt vmcnt(7) ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s32 ; FLATSCR-NEXT: s_waitcnt vmcnt(7) -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[2:3], s32 offset:8 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[2:3], s2 ; FLATSCR-NEXT: s_waitcnt vmcnt(7) -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[4:5], s32 offset:16 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[4:5], s3 ; FLATSCR-NEXT: s_waitcnt vmcnt(7) -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[6:7], s32 offset:24 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[6:7], s4 ; FLATSCR-NEXT: s_waitcnt vmcnt(7) -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[8:9], s32 offset:32 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[8:9], s5 ; FLATSCR-NEXT: s_waitcnt vmcnt(7) -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[10:11], s32 offset:40 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[10:11], s6 ; FLATSCR-NEXT: s_waitcnt vmcnt(7) -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[12:13], s32 offset:48 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[12:13], s7 ; FLATSCR-NEXT: s_waitcnt vmcnt(7) -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[14:15], s32 offset:56 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[14:15], s8 ; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] ; FLATSCR-NEXT: s_endpgm %alloca = alloca [16 x i32], align 4, addrspace(5) @@ -277,17 +286,21 @@ ; FLATSCR-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] ; FLATSCR-NEXT: s_add_i32 s32, s32, 16 +; FLATSCR-NEXT: v_writelane_b32 v41, s0, 0 +; FLATSCR-NEXT: s_add_u32 s0, s32, 4 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 9 -; 
FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:4 +; FLATSCR-NEXT: scratch_store_dword off, v0, s0 +; FLATSCR-NEXT: s_add_u32 s0, s32, 8 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 10 -; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:8 +; FLATSCR-NEXT: scratch_store_dword off, v0, s0 +; FLATSCR-NEXT: s_add_u32 s0, s32, 12 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 11 ; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0 -; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:12 +; FLATSCR-NEXT: scratch_store_dword off, v0, s0 +; FLATSCR-NEXT: s_add_u32 s0, s32, 16 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 12 -; FLATSCR-NEXT: v_writelane_b32 v41, s0, 0 ; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1 -; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:16 +; FLATSCR-NEXT: scratch_store_dword off, v0, s0 ; FLATSCR-NEXT: s_getpc_b64 s[0:1] ; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4 ; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12 @@ -403,35 +416,42 @@ ; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] ; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v0, off ; FLATSCR-NEXT: s_add_i32 s32, s32, 16 -; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0 ; FLATSCR-NEXT: v_writelane_b32 v41, s0, 0 +; FLATSCR-NEXT: s_add_u32 s0, s32, 8 +; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0 +; FLATSCR-NEXT: s_add_u32 s2, s32, 56 ; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1 -; FLATSCR-NEXT: s_getpc_b64 s[0:1] -; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_byval@rel32@lo+4 -; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_byval@rel32@hi+12 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s32 ; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v0, off offset:8 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s32 offset:8 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s0 ; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v0, off offset:16 +; FLATSCR-NEXT: s_add_u32 s0, s32, 16 ; 
FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s32 offset:16 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s0 ; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v0, off offset:24 +; FLATSCR-NEXT: s_add_u32 s0, s32, 24 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s32 offset:24 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s0 ; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v0, off offset:32 +; FLATSCR-NEXT: s_add_u32 s0, s32, 32 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s32 offset:32 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s0 ; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v0, off offset:40 +; FLATSCR-NEXT: s_add_u32 s0, s32, 40 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s32 offset:40 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s0 ; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v0, off offset:48 +; FLATSCR-NEXT: s_add_u32 s0, s32, 48 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s32 offset:48 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s0 ; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], v0, off offset:56 +; FLATSCR-NEXT: s_getpc_b64 s[0:1] +; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_byval@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_byval@rel32@hi+12 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s32 offset:56 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s2 ; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] ; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 ; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll @@ -353,14 +353,16 @@ ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; 
GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-NEXT: scratch_load_ubyte v2, off, s2 offset:1 +; GFX10-NEXT: s_add_i32 s2, s2, 1 +; GFX10-NEXT: scratch_load_ubyte v2, off, s2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: flat_store_dword v[0:1], v2 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_scratch_load_i8_zext_s: ; GFX11: ; %bb.0: -; GFX11-NEXT: scratch_load_u8 v2, off, s0 offset:1 +; GFX11-NEXT: s_add_i32 s0, s0, 1 +; GFX11-NEXT: scratch_load_u8 v2, off, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -379,14 +381,16 @@ ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-NEXT: scratch_load_sbyte v2, off, s2 offset:1 +; GFX10-NEXT: s_add_i32 s2, s2, 1 +; GFX10-NEXT: scratch_load_sbyte v2, off, s2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: flat_store_dword v[0:1], v2 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_scratch_load_i8_sext_s: ; GFX11: ; %bb.0: -; GFX11-NEXT: scratch_load_i8 v2, off, s0 offset:1 +; GFX11-NEXT: s_add_i32 s0, s0, 1 +; GFX11-NEXT: scratch_load_i8 v2, off, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -405,14 +409,16 @@ ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-NEXT: scratch_load_ushort v2, off, s2 offset:2 +; GFX10-NEXT: s_add_i32 s2, s2, 2 +; GFX10-NEXT: scratch_load_ushort v2, off, s2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: flat_store_dword v[0:1], v2 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_scratch_load_i16_zext_s: ; GFX11: ; %bb.0: -; GFX11-NEXT: scratch_load_u16 v2, off, s0 offset:2 +; GFX11-NEXT: s_add_i32 s0, s0, 2 +; GFX11-NEXT: scratch_load_u16 v2, off, s0 ; 
GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -431,14 +437,16 @@ ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-NEXT: scratch_load_sshort v2, off, s2 offset:2 +; GFX10-NEXT: s_add_i32 s2, s2, 2 +; GFX10-NEXT: scratch_load_sshort v2, off, s2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: flat_store_dword v[0:1], v2 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_scratch_load_i16_sext_s: ; GFX11: ; %bb.0: -; GFX11-NEXT: scratch_load_i16 v2, off, s0 offset:2 +; GFX11-NEXT: s_add_i32 s0, s0, 2 +; GFX11-NEXT: scratch_load_i16 v2, off, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -458,7 +466,8 @@ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff0000 -; GFX10-NEXT: scratch_load_ubyte_d16 v2, off, s2 offset:1 +; GFX10-NEXT: s_add_i32 s2, s2, 1 +; GFX10-NEXT: scratch_load_ubyte_d16 v2, off, s2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: flat_store_dword v[0:1], v2 ; GFX10-NEXT: s_endpgm @@ -466,7 +475,8 @@ ; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_lo_s: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: v_mov_b32_e32 v2, 0xffff0000 -; GFX11-NEXT: scratch_load_d16_u8 v2, off, s0 offset:1 +; GFX11-NEXT: s_add_i32 s0, s0, 1 +; GFX11-NEXT: scratch_load_d16_u8 v2, off, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -488,7 +498,8 @@ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff0000 -; GFX10-NEXT: scratch_load_sbyte_d16 v2, off, s2 offset:1 +; GFX10-NEXT: s_add_i32 s2, s2, 1 +; GFX10-NEXT: scratch_load_sbyte_d16 v2, off, s2 ; 
GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: flat_store_dword v[0:1], v2 ; GFX10-NEXT: s_endpgm @@ -496,7 +507,8 @@ ; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_lo_s: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: v_mov_b32_e32 v2, 0xffff0000 -; GFX11-NEXT: scratch_load_d16_i8 v2, off, s0 offset:1 +; GFX11-NEXT: s_add_i32 s0, s0, 1 +; GFX11-NEXT: scratch_load_d16_i8 v2, off, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -518,7 +530,8 @@ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff0000 -; GFX10-NEXT: scratch_load_short_d16 v2, off, s2 offset:2 +; GFX10-NEXT: s_add_i32 s2, s2, 2 +; GFX10-NEXT: scratch_load_short_d16 v2, off, s2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: flat_store_dword v[0:1], v2 ; GFX10-NEXT: s_endpgm @@ -526,7 +539,8 @@ ; GFX11-LABEL: test_scratch_load_i16_to_d16_lo_s: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: v_mov_b32_e32 v2, 0xffff0000 -; GFX11-NEXT: scratch_load_d16_b16 v2, off, s0 offset:2 +; GFX11-NEXT: s_add_i32 s0, s0, 2 +; GFX11-NEXT: scratch_load_d16_b16 v2, off, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -548,7 +562,8 @@ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: v_mov_b32_e32 v2, -1 -; GFX10-NEXT: scratch_load_ubyte_d16_hi v2, off, s2 offset:1 +; GFX10-NEXT: s_add_i32 s2, s2, 1 +; GFX10-NEXT: scratch_load_ubyte_d16_hi v2, off, s2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: flat_store_dword v[0:1], v2 ; GFX10-NEXT: s_endpgm @@ -556,7 +571,8 @@ ; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_hi_s: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: v_mov_b32_e32 v2, -1 -; GFX11-NEXT: scratch_load_d16_hi_u8 v2, off, s0 offset:1 +; GFX11-NEXT: s_add_i32 s0, s0, 1 +; GFX11-NEXT: 
scratch_load_d16_hi_u8 v2, off, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -578,7 +594,8 @@ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: v_mov_b32_e32 v2, -1 -; GFX10-NEXT: scratch_load_sbyte_d16_hi v2, off, s2 offset:1 +; GFX10-NEXT: s_add_i32 s2, s2, 1 +; GFX10-NEXT: scratch_load_sbyte_d16_hi v2, off, s2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: flat_store_dword v[0:1], v2 ; GFX10-NEXT: s_endpgm @@ -586,7 +603,8 @@ ; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_hi_s: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: v_mov_b32_e32 v2, -1 -; GFX11-NEXT: scratch_load_d16_hi_i8 v2, off, s0 offset:1 +; GFX11-NEXT: s_add_i32 s0, s0, 1 +; GFX11-NEXT: scratch_load_d16_hi_i8 v2, off, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -608,7 +626,8 @@ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: v_mov_b32_e32 v2, -1 -; GFX10-NEXT: scratch_load_short_d16_hi v2, off, s2 offset:2 +; GFX10-NEXT: s_add_i32 s2, s2, 2 +; GFX10-NEXT: scratch_load_short_d16_hi v2, off, s2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: flat_store_dword v[0:1], v2 ; GFX10-NEXT: s_endpgm @@ -616,7 +635,8 @@ ; GFX11-LABEL: test_scratch_load_i16_to_d16_hi_s: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: v_mov_b32_e32 v2, -1 -; GFX11-NEXT: scratch_load_d16_hi_b16 v2, off, s0 offset:2 +; GFX11-NEXT: s_add_i32 s0, s0, 2 +; GFX11-NEXT: scratch_load_d16_hi_b16 v2, off, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -637,15 +657,17 @@ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: flat_load_dword v0, v[0:1] +; GFX10-NEXT: s_add_i32 s2, 
s2, 4 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: scratch_store_byte_d16_hi off, v0, s2 offset:4 +; GFX10-NEXT: scratch_store_byte_d16_hi off, v0, s2 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_scratch_store_b8_from_d16_hi_s: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: flat_load_b32 v0, v[0:1] +; GFX11-NEXT: s_add_i32 s0, s0, 4 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: scratch_store_d16_hi_b8 off, v0, s0 offset:4 +; GFX11-NEXT: scratch_store_d16_hi_b8 off, v0, s0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm bb: @@ -664,15 +686,17 @@ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: flat_load_dword v0, v[0:1] +; GFX10-NEXT: s_add_i32 s2, s2, 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: scratch_store_short_d16_hi off, v0, s2 offset:2 +; GFX10-NEXT: scratch_store_short_d16_hi off, v0, s2 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_scratch_store_b16_from_d16_hi_s: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: flat_load_b32 v0, v[0:1] +; GFX11-NEXT: s_add_i32 s0, s0, 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: scratch_store_d16_hi_b16 off, v0, s0 offset:2 +; GFX11-NEXT: scratch_store_d16_hi_b16 off, v0, s0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -13845,6 +13845,8 @@ ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s2, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 @@ -13883,8 +13885,7 @@ ; 
GFX11-NEXT: v_writelane_b32 v40, s25, 21 ; GFX11-NEXT: s_mov_b32 s24, s40 ; GFX11-NEXT: s_mov_b32 s25, s41 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b64 off, v[4:5], s32 offset:16 +; GFX11-NEXT: scratch_store_b64 off, v[4:5], s2 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 ; GFX11-NEXT: v_writelane_b32 v40, s26, 22 ; GFX11-NEXT: s_mov_b32 s26, s42 @@ -13952,6 +13953,7 @@ ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: s_add_i32 s2, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -13993,7 +13995,7 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s23, s39 ; GFX10-SCRATCH-NEXT: s_mov_b32 s24, s40 ; GFX10-SCRATCH-NEXT: s_mov_b32 s25, s41 -; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[4:5], s32 offset:16 +; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[4:5], s2 ; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s26, 22 ; GFX10-SCRATCH-NEXT: s_mov_b32 s26, s42 @@ -14321,6 +14323,7 @@ ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v32i32_i32_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v32i32_i32_inreg@rel32@hi+12 +; GFX11-NEXT: s_add_i32 s3, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s20, 16 ; GFX11-NEXT: v_writelane_b32 v40, s21, 17 ; GFX11-NEXT: v_writelane_b32 v40, s22, 18 @@ -14331,19 +14334,19 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, s46 :: v_dual_mov_b32 v3, s49 ; GFX11-NEXT: v_writelane_b32 v40, s24, 20 ; GFX11-NEXT: v_mov_b32_e32 v2, s48 +; GFX11-NEXT: s_add_i32 s2, s32, 24 ; GFX11-NEXT: s_mov_b32 s20, s36 ; GFX11-NEXT: s_mov_b32 s21, s37 -; GFX11-NEXT: s_mov_b32 s22, s38 ; GFX11-NEXT: v_writelane_b32 v40, s25, 21 +; GFX11-NEXT: s_mov_b32 s22, s38 ; GFX11-NEXT: s_mov_b32 s23, s39 ; GFX11-NEXT: s_mov_b32 s24, 
s40 ; GFX11-NEXT: s_mov_b32 s25, s41 -; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: scratch_store_b32 off, v6, s32 offset:24 -; GFX11-NEXT: scratch_store_b64 off, v[4:5], s32 offset:16 -; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 ; GFX11-NEXT: v_writelane_b32 v40, s26, 22 ; GFX11-NEXT: s_mov_b32 s26, s42 +; GFX11-NEXT: scratch_store_b32 off, v6, s2 +; GFX11-NEXT: scratch_store_b64 off, v[4:5], s3 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 ; GFX11-NEXT: v_writelane_b32 v40, s27, 23 ; GFX11-NEXT: s_mov_b32 s27, s43 ; GFX11-NEXT: v_writelane_b32 v40, s28, 24 @@ -14433,11 +14436,13 @@ ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v32i32_i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v32i32_i32_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: s_add_i32 s3, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s20, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s21, 17 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s22, 18 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, s2 +; GFX10-SCRATCH-NEXT: s_add_i32 s2, s32, 24 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, s50 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s23, 19 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, s51 @@ -14453,8 +14458,8 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s23, s39 ; GFX10-SCRATCH-NEXT: s_mov_b32 s24, s40 ; GFX10-SCRATCH-NEXT: s_mov_b32 s25, s41 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v6, s32 offset:24 -; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[4:5], s32 offset:16 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v6, s2 +; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[4:5], s3 ; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s26, 22 ; GFX10-SCRATCH-NEXT: s_mov_b32 s26, s42 @@ -15115,15 +15120,16 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 
-; GFX11-NEXT: v_dual_mov_b32 v0, 12 :: v_dual_mov_b32 v1, 13 -; GFX11-NEXT: v_dual_mov_b32 v2, 14 :: v_dual_mov_b32 v3, 15 -; GFX11-NEXT: v_dual_mov_b32 v4, 8 :: v_dual_mov_b32 v5, 9 -; GFX11-NEXT: v_dual_mov_b32 v6, 10 :: v_dual_mov_b32 v7, 11 +; GFX11-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 9 +; GFX11-NEXT: v_dual_mov_b32 v2, 10 :: v_dual_mov_b32 v3, 11 +; GFX11-NEXT: v_dual_mov_b32 v4, 12 :: v_dual_mov_b32 v5, 13 +; GFX11-NEXT: v_dual_mov_b32 v6, 14 :: v_dual_mov_b32 v7, 15 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: s_add_i32 s0, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16 -; GFX11-NEXT: scratch_store_b128 off, v[4:7], s32 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 +; GFX11-NEXT: scratch_store_b128 off, v[4:7], s0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0 ; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 1 @@ -15140,7 +15146,6 @@ ; GFX11-NEXT: v_dual_mov_b32 v26, 5 :: v_dual_mov_b32 v27, 5 ; GFX11-NEXT: v_dual_mov_b32 v28, 5 :: v_dual_mov_b32 v29, 5 ; GFX11-NEXT: v_dual_mov_b32 v30, 6 :: v_dual_mov_b32 v31, 7 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_8xv5i32@rel32@lo+4 @@ -15171,18 +15176,20 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 12 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 13 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 14 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 8 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 9 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 10 -; GFX10-SCRATCH-NEXT: 
v_mov_b32_e32 v7, 11 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 8 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 9 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 10 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 11 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 12 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 13 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 14 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 15 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s0, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 -; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[4:7], s32 +; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 +; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[4:7], s0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 @@ -15215,7 +15222,6 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v29, 5 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v30, 6 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v31, 7 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_8xv5i32@rel32@lo+4 @@ -15417,19 +15423,20 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, 0x41400000 -; GFX11-NEXT: v_mov_b32_e32 v1, 0x41500000 -; GFX11-NEXT: v_mov_b32_e32 v2, 0x41600000 -; GFX11-NEXT: v_mov_b32_e32 v3, 0x41700000 -; GFX11-NEXT: v_mov_b32_e32 v4, 0x41000000 -; GFX11-NEXT: v_mov_b32_e32 v5, 0x41100000 -; GFX11-NEXT: v_mov_b32_e32 v6, 0x41200000 -; GFX11-NEXT: v_mov_b32_e32 v7, 0x41300000 +; GFX11-NEXT: v_mov_b32_e32 v0, 0x41000000 +; GFX11-NEXT: v_mov_b32_e32 v1, 0x41100000 +; GFX11-NEXT: v_mov_b32_e32 v2, 0x41200000 +; GFX11-NEXT: v_mov_b32_e32 v3, 
0x41300000 +; GFX11-NEXT: v_mov_b32_e32 v4, 0x41400000 +; GFX11-NEXT: v_mov_b32_e32 v5, 0x41500000 +; GFX11-NEXT: v_mov_b32_e32 v6, 0x41600000 +; GFX11-NEXT: v_mov_b32_e32 v7, 0x41700000 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: s_add_i32 s0, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16 -; GFX11-NEXT: scratch_store_b128 off, v[4:7], s32 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 +; GFX11-NEXT: scratch_store_b128 off, v[4:7], s0 ; GFX11-NEXT: v_mov_b32_e32 v6, 1.0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0 @@ -15448,7 +15455,6 @@ ; GFX11-NEXT: v_mov_b32_e32 v29, 0x40a00000 ; GFX11-NEXT: v_mov_b32_e32 v30, 0x40c00000 ; GFX11-NEXT: v_mov_b32_e32 v31, 0x40e00000 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_8xv5f32@rel32@lo+4 @@ -15479,18 +15485,20 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x41400000 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x41500000 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0x41600000 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 0x41700000 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 0x41000000 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 0x41100000 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 0x41200000 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 0x41300000 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x41000000 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x41100000 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0x41200000 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 0x41300000 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 0x41400000 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 
v5, 0x41500000 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 0x41600000 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 0x41700000 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s0, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 -; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[4:7], s32 +; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 +; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[4:7], s0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 @@ -15523,7 +15531,6 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v29, 0x40a00000 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v30, 0x40c00000 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v31, 0x40e00000 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_8xv5f32@rel32@lo+4 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll @@ -1497,137 +1497,263 @@ ; GFX11-NEXT: s_mov_b32 s2, s0 ; GFX11-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2 ; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 -; GFX11-NEXT: s_clause 0x3e -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:2032 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:2016 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:2000 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1984 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1968 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1952 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1936 -; GFX11-NEXT: 
scratch_store_b128 v0, v[1:4], off offset:1920 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1904 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1888 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1872 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1856 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1840 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1824 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1808 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1792 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1776 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1760 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1744 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1728 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1712 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1696 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1680 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1664 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1648 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1632 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1616 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1600 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1584 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1568 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1552 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1536 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1520 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1504 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1488 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1472 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1456 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1440 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1424 -; GFX11-NEXT: 
scratch_store_b128 v0, v[1:4], off offset:1408 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1392 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1376 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1360 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1344 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1328 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1312 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1296 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1280 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1264 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1248 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1232 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1216 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1200 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1184 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1168 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1152 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1136 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1120 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1104 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1088 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1072 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1056 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1040 -; GFX11-NEXT: s_clause 0x3e -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1024 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1008 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:992 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:976 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:960 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:944 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:928 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:912 -; 
GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:896 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:880 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:864 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:848 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:832 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:816 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:800 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:784 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:768 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:752 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:736 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:720 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:704 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:688 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:672 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:656 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:640 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:624 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:608 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:592 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:576 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:560 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:544 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:528 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:512 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:496 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:480 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:464 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:448 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:432 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:416 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:400 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off 
offset:384 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:368 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:352 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:336 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:320 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:304 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:288 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:272 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:256 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:240 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:224 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:208 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:192 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:176 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:160 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:144 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:128 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:96 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:80 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:64 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:48 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:16 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_add_i32 s1, s0, 0x7f0 +; GFX11-NEXT: s_add_i32 s2, s0, 0x7e0 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 +; GFX11-NEXT: s_add_i32 s3, s0, 0x7d0 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s3 +; GFX11-NEXT: s_add_i32 s1, s0, 0x7c0 +; GFX11-NEXT: s_add_i32 s2, s0, 0x7b0 +; GFX11-NEXT: 
scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x7a0 +; GFX11-NEXT: s_add_i32 s2, s0, 0x790 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x780 +; GFX11-NEXT: s_add_i32 s2, s0, 0x770 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x760 +; GFX11-NEXT: s_add_i32 s2, s0, 0x750 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x740 +; GFX11-NEXT: s_add_i32 s2, s0, 0x730 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x720 +; GFX11-NEXT: s_add_i32 s2, s0, 0x710 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x700 +; GFX11-NEXT: s_add_i32 s2, s0, 0x6f0 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x6e0 +; GFX11-NEXT: s_add_i32 s2, s0, 0x6d0 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x6c0 +; GFX11-NEXT: s_add_i32 s2, s0, 0x6b0 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x6a0 +; GFX11-NEXT: s_add_i32 s2, s0, 0x690 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x680 +; GFX11-NEXT: s_add_i32 s2, s0, 0x670 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x660 +; GFX11-NEXT: s_add_i32 s2, s0, 0x650 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; 
GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x640 +; GFX11-NEXT: s_add_i32 s2, s0, 0x630 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x620 +; GFX11-NEXT: s_add_i32 s2, s0, 0x610 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x600 +; GFX11-NEXT: s_add_i32 s2, s0, 0x5f0 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x5e0 +; GFX11-NEXT: s_add_i32 s2, s0, 0x5d0 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x5c0 +; GFX11-NEXT: s_add_i32 s2, s0, 0x5b0 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x5a0 +; GFX11-NEXT: s_add_i32 s2, s0, 0x590 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x580 +; GFX11-NEXT: s_add_i32 s2, s0, 0x570 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x560 +; GFX11-NEXT: s_add_i32 s2, s0, 0x550 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x540 +; GFX11-NEXT: s_add_i32 s2, s0, 0x530 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x520 +; GFX11-NEXT: s_add_i32 s2, s0, 0x510 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x500 +; GFX11-NEXT: s_add_i32 s2, s0, 0x4f0 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, 
v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x4e0 +; GFX11-NEXT: s_add_i32 s2, s0, 0x4d0 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x4c0 +; GFX11-NEXT: s_add_i32 s2, s0, 0x4b0 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x4a0 +; GFX11-NEXT: s_add_i32 s2, s0, 0x490 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x480 +; GFX11-NEXT: s_add_i32 s2, s0, 0x470 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x460 +; GFX11-NEXT: s_add_i32 s2, s0, 0x450 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x440 +; GFX11-NEXT: s_add_i32 s2, s0, 0x430 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x420 +; GFX11-NEXT: s_add_i32 s2, s0, 0x410 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x400 +; GFX11-NEXT: s_add_i32 s2, s0, 0x3f0 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x3e0 +; GFX11-NEXT: s_add_i32 s2, s0, 0x3d0 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x3c0 +; GFX11-NEXT: s_add_i32 s2, s0, 0x3b0 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x3a0 +; GFX11-NEXT: s_add_i32 s2, s0, 0x390 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, 
s0, 0x380 +; GFX11-NEXT: s_add_i32 s2, s0, 0x370 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x360 +; GFX11-NEXT: s_add_i32 s2, s0, 0x350 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x340 +; GFX11-NEXT: s_add_i32 s2, s0, 0x330 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x320 +; GFX11-NEXT: s_add_i32 s2, s0, 0x310 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x300 +; GFX11-NEXT: s_add_i32 s2, s0, 0x2f0 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x2e0 +; GFX11-NEXT: s_add_i32 s2, s0, 0x2d0 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x2c0 +; GFX11-NEXT: s_add_i32 s2, s0, 0x2b0 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x2a0 +; GFX11-NEXT: s_add_i32 s2, s0, 0x290 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x280 +; GFX11-NEXT: s_add_i32 s2, s0, 0x270 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x260 +; GFX11-NEXT: s_add_i32 s2, s0, 0x250 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x240 +; GFX11-NEXT: s_add_i32 s2, s0, 0x230 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x220 +; GFX11-NEXT: s_add_i32 s2, 
s0, 0x210 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x200 +; GFX11-NEXT: s_add_i32 s2, s0, 0x1f0 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x1e0 +; GFX11-NEXT: s_add_i32 s2, s0, 0x1d0 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x1c0 +; GFX11-NEXT: s_add_i32 s2, s0, 0x1b0 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x1a0 +; GFX11-NEXT: s_add_i32 s2, s0, 0x190 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x180 +; GFX11-NEXT: s_add_i32 s2, s0, 0x170 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x160 +; GFX11-NEXT: s_add_i32 s2, s0, 0x150 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x140 +; GFX11-NEXT: s_add_i32 s2, s0, 0x130 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x120 +; GFX11-NEXT: s_add_i32 s2, s0, 0x110 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x100 +; GFX11-NEXT: s_add_i32 s2, s0, 0xf0 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0xe0 +; GFX11-NEXT: s_add_i32 s2, s0, 0xd0 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0xc0 +; GFX11-NEXT: s_add_i32 s2, s0, 0xb0 +; GFX11-NEXT: scratch_store_b128 
off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0xa0 +; GFX11-NEXT: s_add_i32 s2, s0, 0x90 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x80 +; GFX11-NEXT: s_add_i32 s2, s0, 0x70 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 0x60 +; GFX11-NEXT: s_add_i32 s2, s0, 0x50 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 64 +; GFX11-NEXT: s_add_i32 s2, s0, 48 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 +; GFX11-NEXT: s_add_i32 s1, s0, 32 +; GFX11-NEXT: s_add_i32 s0, s0, 16 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: