diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1849,6 +1849,8 @@ CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32)); VAddr = SDValue(VMov, 0); SAddr = LHS; + if (!isFlatScratchBaseLegal(SAddr) || !isFlatScratchBaseLegal(VAddr)) + return false; if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset)) return false; Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16); @@ -1873,6 +1875,9 @@ return false; } + if (!isFlatScratchBaseLegal(SAddr) || !isFlatScratchBaseLegal(VAddr)) + return false; + if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset)) return false; SAddr = SelectSAddrFI(CurDAG, SAddr); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -4103,6 +4103,9 @@ Register LHS = AddrDef->MI->getOperand(1).getReg(); auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI); + if (!isFlatScratchBaseLegal(LHS) || !isFlatScratchBaseLegal(RHS)) + return std::nullopt; + if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset)) return std::nullopt; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -50,9 +50,9 @@ ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_lshl_b32 s1, s0, 2 ; GFX940-NEXT: s_and_b32 s0, s0, 15 -; GFX940-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NEXT: s_add_i32 s1, s1, 4 ; GFX940-NEXT: s_lshl_b32 s0, s0, 2 -; GFX940-NEXT: scratch_store_dword v1, v0, off offset:4 sc0 sc1 +; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NEXT: scratch_load_dword v0, v0, off offset:4 sc0 sc1 @@ -62,17 +62,16 @@ ; GFX11-LABEL: store_load_sindex_kernel: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 15 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshl_b32 s1, s0, 2 -; GFX11-NEXT: s_and_b32 s0, s0, 15 -; GFX11-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-NEXT: s_and_b32 s1, s0, 15 ; GFX11-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-NEXT: s_lshl_b32 s1, s1, 2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:4 dlc +; GFX11-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_add_i32 s0, s0, 4 +; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_b32 v0, v2, off offset:4 glc dlc +; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm bb: @@ -126,24 +125,28 @@ ; GFX940-LABEL: store_load_vindex_kernel: ; GFX940: ; %bb.0: ; %bb ; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX940-NEXT: v_mov_b32_e32 v2, 15 +; GFX940-NEXT: v_mov_b32_e32 v3, 15 ; GFX940-NEXT: v_sub_u32_e32 v0, 0, v0 -; GFX940-NEXT: scratch_store_dword v1, v2, off offset:4 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, 4 +; GFX940-NEXT: scratch_store_dword v1, v3, off offset:4 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-NEXT: scratch_load_dword v0, v0, off offset:128 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v1, 0x7c +; GFX940-NEXT: v_add3_u32 v0, v2, v0, v1 +; GFX940-NEXT: scratch_load_dword v0, v0, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_vindex_kernel: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_dual_mov_b32 v3, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1 -; GFX11-NEXT: scratch_store_b32 v0, v2, off offset:4 dlc +; GFX11-NEXT: v_dual_mov_b32 v2, 0x7c :: v_dual_lshlrev_b32 v1, 2, v1 +; GFX11-NEXT: scratch_store_b32 v0, v3, off offset:4 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:128 glc dlc +; GFX11-NEXT: v_add3_u32 v1, 4, v1, v2 +; GFX11-NEXT: scratch_load_b32 v0, v1, off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm bb: @@ -194,9 +197,10 @@ ; GFX940: ; %bb.0: ; %bb ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX940-NEXT: v_add_u32_e32 v1, s32, v1 ; GFX940-NEXT: v_mov_b32_e32 v2, 15 ; GFX940-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX940-NEXT: scratch_store_dword v1, v2, s32 sc0 sc1 +; GFX940-NEXT: scratch_store_dword v1, v2, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-NEXT: scratch_load_dword v0, v0, s32 sc0 sc1 @@ -207,13 +211,14 @@ ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 +; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v1, s32, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX11-NEXT: scratch_store_b32 v0, v2, s32 dlc +; GFX11-NEXT: scratch_store_b32 v1, v2, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_b32 v0, v1, s32 glc dlc +; GFX11-NEXT: scratch_load_b32 v0, v0, s32 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] bb: @@ -320,9 +325,9 @@ ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_lshl_b32 s1, s0, 2 ; GFX940-NEXT: s_and_b32 s0, s0, 15 -; GFX940-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NEXT: s_addk_i32 s1, 0x104 ; GFX940-NEXT: s_lshl_b32 s0, s0, 2 -; GFX940-NEXT: scratch_store_dword v1, v0, off offset:260 sc0 sc1 +; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NEXT: scratch_load_dword v0, v0, off offset:260 sc0 sc1 @@ -332,19 +337,17 @@ ; GFX11-LABEL: store_load_sindex_small_offset_kernel: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, 15 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshl_b32 s1, s0, 2 -; GFX11-NEXT: s_and_b32 s0, s0, 15 -; GFX11-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-NEXT: scratch_load_b32 v2, off, off offset:4 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_and_b32 s1, s0, 15 ; GFX11-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-NEXT: s_lshl_b32 s1, s1, 2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:260 dlc +; GFX11-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_addk_i32 s0, 0x104 +; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_b32 v0, v2, off offset:260 glc dlc +; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:260 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm bb: @@ -408,26 +411,30 @@ ; GFX940-NEXT: scratch_load_dword v1, off, off offset:4 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX940-NEXT: v_mov_b32_e32 v2, 15 +; GFX940-NEXT: v_mov_b32_e32 v3, 15 ; GFX940-NEXT: v_sub_u32_e32 v0, 0, v0 -; GFX940-NEXT: scratch_store_dword v1, v2, off offset:260 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, 0x104 +; GFX940-NEXT: scratch_store_dword v1, v3, off offset:260 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-NEXT: scratch_load_dword v0, v0, off offset:384 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v1, 0x7c +; GFX940-NEXT: v_add3_u32 v0, v2, v0, v1 +; GFX940-NEXT: scratch_load_dword v0, v0, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_vindex_small_offset_kernel: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 15 -; GFX11-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc +; GFX11-NEXT: v_dual_mov_b32 v3, 15 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 0x7c :: v_dual_lshlrev_b32 v1, 2, v1 +; GFX11-NEXT: v_add3_u32 v1, 0x104, v1, v2 +; GFX11-NEXT: scratch_load_b32 v2, off, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX11-NEXT: scratch_store_b32 v0, v2, off offset:260 dlc +; GFX11-NEXT: scratch_store_b32 v0, v3, off offset:260 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:384 glc dlc +; GFX11-NEXT: scratch_load_b32 v0, v1, off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm bb: @@ -491,9 +498,11 @@ ; GFX940-NEXT: scratch_load_dword v1, off, s32 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x100 +; GFX940-NEXT: v_add_u32_e32 v1, vcc_hi, v1 ; GFX940-NEXT: v_mov_b32_e32 v2, 15 ; GFX940-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX940-NEXT: scratch_store_dword v1, v2, s32 offset:256 sc0 sc1 +; GFX940-NEXT: scratch_store_dword v1, v2, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-NEXT: scratch_load_dword v0, v0, s32 offset:256 sc0 sc1 @@ -504,14 +513,16 @@ ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX11-NEXT: s_add_i32 vcc_lo, s32, 0x100 ; GFX11-NEXT: scratch_load_b32 v3, off, s32 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX11-NEXT: scratch_store_b32 v0, v2, s32 offset:256 dlc +; GFX11-NEXT: v_add_nc_u32_e32 v1, vcc_lo, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: scratch_store_b32 v1, v2, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_b32 v0, v1, s32 offset:256 glc dlc +; GFX11-NEXT: scratch_load_b32 v0, v0, s32 offset:256 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] bb: @@ -576,14 +587,13 @@ ; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, 15 -; GFX940-NEXT: s_movk_i32 vcc_lo, 0x4004 ; GFX940-NEXT: s_movk_i32 vcc_hi, 0x4004 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_lshl_b32 s1, s0, 2 ; GFX940-NEXT: s_and_b32 s0, s0, 15 -; GFX940-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NEXT: s_addk_i32 s1, 0x4004 ; GFX940-NEXT: s_lshl_b32 s0, s0, 2 -; GFX940-NEXT: scratch_store_dword v1, v0, vcc_lo sc0 sc1 +; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NEXT: scratch_load_dword v0, v0, vcc_hi sc0 sc1 @@ -594,20 +604,17 @@ ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: s_movk_i32 vcc_lo, 0x4004 -; GFX11-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, 15 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshl_b32 s1, s0, 2 -; GFX11-NEXT: s_and_b32 s0, s0, 15 -; GFX11-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-NEXT: scratch_load_b32 v2, off, off offset:4 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_and_b32 s1, s0, 15 ; GFX11-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-NEXT: s_lshl_b32 s1, s1, 2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-NEXT: s_movk_i32 s0, 0x4004 -; GFX11-NEXT: scratch_store_b32 v0, v1, s0 dlc +; GFX11-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_addk_i32 s0, 0x4004 +; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_b32 v0, v2, vcc_lo glc dlc +; GFX11-NEXT: scratch_load_b32 v0, v1, vcc_lo glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm bb: @@ -671,30 +678,32 @@ ; GFX940-NEXT: scratch_load_dword v1, off, off offset:4 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX940-NEXT: v_mov_b32_e32 v2, 15 -; GFX940-NEXT: s_movk_i32 vcc_lo, 0x4004 +; GFX940-NEXT: v_mov_b32_e32 v3, 15 +; GFX940-NEXT: s_movk_i32 vcc_hi, 0x4004 ; GFX940-NEXT: v_sub_u32_e32 v0, 0, v0 -; GFX940-NEXT: scratch_store_dword v1, v2, vcc_lo sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, 0x4004 +; GFX940-NEXT: scratch_store_dword v1, v3, vcc_hi sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-NEXT: s_movk_i32 vcc_hi, 0x4004 -; GFX940-NEXT: scratch_load_dword v0, v0, vcc_hi offset:124 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v1, 0x7c +; GFX940-NEXT: v_add3_u32 v0, v2, v0, v1 +; GFX940-NEXT: scratch_load_dword v0, v0, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_vindex_large_offset_kernel: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 15 -; GFX11-NEXT: s_movk_i32 s0, 0x4004 +; GFX11-NEXT: v_dual_mov_b32 v3, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_movk_i32 vcc_lo, 0x4004 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX11-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 0x7c :: v_dual_lshlrev_b32 v1, 2, v1 +; GFX11-NEXT: v_add3_u32 v1, 0x4004, v1, v2 +; GFX11-NEXT: scratch_load_b32 v2, off, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b32 v0, v2, s0 dlc +; GFX11-NEXT: scratch_store_b32 v0, v3, vcc_lo dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_b32 v0, v1, vcc_lo offset:124 glc dlc +; GFX11-NEXT: scratch_load_b32 v0, v1, off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm bb: @@ -758,10 +767,11 @@ ; GFX940-NEXT: scratch_load_dword v1, off, s32 offset:4 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX940-NEXT: v_mov_b32_e32 v2, 15 ; GFX940-NEXT: s_add_i32 vcc_lo, s32, 0x4004 +; GFX940-NEXT: v_add_u32_e32 v1, vcc_lo, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, 15 ; GFX940-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX940-NEXT: scratch_store_dword v1, v2, vcc_lo sc0 sc1 +; GFX940-NEXT: scratch_store_dword v1, v2, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x4004 @@ -773,17 +783,18 @@ ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX11-NEXT: s_add_i32 s0, s32, 0x4004 ; GFX11-NEXT: s_add_i32 vcc_lo, s32, 0x4004 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v1, s0, v1 ; GFX11-NEXT: scratch_load_b32 v3, off, s32 offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b32 v0, v2, s0 dlc +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: scratch_store_b32 v1, v2, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_b32 v0, v1, vcc_lo glc dlc +; GFX11-NEXT: scratch_load_b32 v0, v0, vcc_lo glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] bb: @@ -981,24 +992,30 @@ ; GFX940-LABEL: store_load_vidx_sidx_offset: ; GFX940: ; %bb.0: ; %bb ; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-NEXT: v_mov_b32_e32 v1, 15 +; GFX940-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-NEXT: v_mov_b32_e32 v2, 0x400 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_add_lshl_u32 v0, s0, v0, 2 -; GFX940-NEXT: scratch_store_dword v0, v1, off offset:1028 sc0 sc1 +; GFX940-NEXT: v_add3_u32 v0, v1, v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, 15 +; GFX940-NEXT: scratch_store_dword v0, v1, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: scratch_load_dword v0, v0, off offset:1028 sc0 sc1 +; GFX940-NEXT: scratch_load_dword v0, v0, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_vidx_sidx_offset: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 15 +; GFX11-NEXT: v_mov_b32_e32 v1, 0x400 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_lshl_u32 v0, s0, v0, 2 -; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:1028 dlc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v0, 4, v0, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, 15 +; GFX11-NEXT: scratch_store_b32 v0, v1, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_b32 v0, v0, off offset:1028 glc dlc +; GFX11-NEXT: scratch_load_b32 v0, v0, off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll @@ -747,7 +747,9 @@ ; GFX11-LABEL: test_scratch_load_i8_zext_svs: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: scratch_load_u8 v0, v0, s0 offset:1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v0, s0, v0, 1 +; GFX11-NEXT: scratch_load_u8 v0, v0, off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[1:2], v0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -778,7 +780,9 @@ ; GFX11-LABEL: test_scratch_load_i8_sext_svs: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: scratch_load_i8 v0, v0, s0 offset:1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v0, s0, v0, 1 +; GFX11-NEXT: scratch_load_i8 v0, v0, off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[1:2], v0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -809,7 +813,9 @@ ; GFX11-LABEL: test_scratch_load_i16_zext_svs: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: scratch_load_u16 v0, v0, s0 offset:2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v0, s0, v0, 2 +; GFX11-NEXT: scratch_load_u16 v0, v0, off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[1:2], v0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -840,7 +846,9 @@ ; GFX11-LABEL: test_scratch_load_i16_sext_svs: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: scratch_load_i16 v0, v0, s0 offset:2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v0, s0, v0, 2 +; GFX11-NEXT: scratch_load_i16 v0, v0, off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[1:2], v0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -872,7 +880,9 @@ ; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_lo_svs: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: v_dual_mov_b32 v3, 0xffff0000 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: scratch_load_d16_u8 v3, v0, s0 offset:1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v0, s0, v0, 1 +; GFX11-NEXT: scratch_load_d16_u8 v3, v0, off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[1:2], v3 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -906,7 +916,9 @@ ; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_lo_svs: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: v_dual_mov_b32 v3, 0xffff0000 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: scratch_load_d16_i8 v3, v0, s0 offset:1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v0, s0, v0, 1 +; GFX11-NEXT: scratch_load_d16_i8 v3, v0, off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[1:2], v3 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -940,7 +952,9 @@ ; GFX11-LABEL: test_scratch_load_i16_to_d16_lo_svs: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: v_dual_mov_b32 v3, 0xffff0000 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: scratch_load_d16_b16 v3, v0, s0 offset:2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v0, s0, v0, 2 +; GFX11-NEXT: scratch_load_d16_b16 v3, v0, off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[1:2], v3 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -974,7 +988,9 @@ ; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_hi_svs: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: scratch_load_d16_hi_u8 v3, v0, s0 offset:1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v0, s0, v0, 1 +; GFX11-NEXT: scratch_load_d16_hi_u8 v3, v0, off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[1:2], v3 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1008,7 +1024,9 @@ ; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_hi_svs: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: scratch_load_d16_hi_i8 v3, v0, s0 offset:1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v0, s0, v0, 1 +; GFX11-NEXT: scratch_load_d16_hi_i8 v3, v0, off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[1:2], v3 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1042,7 +1060,9 @@ ; GFX11-LABEL: test_scratch_load_i16_to_d16_hi_svs: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: scratch_load_d16_hi_b16 v3, v0, s0 offset:2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v0, s0, v0, 2 +; GFX11-NEXT: scratch_load_d16_hi_b16 v3, v0, off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[1:2], v3 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1075,8 +1095,10 @@ ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: flat_load_b32 v0, v[0:1] ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v1, s0, v1, 4 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: scratch_store_d16_hi_b8 v1, v0, s0 offset:4 +; GFX11-NEXT: scratch_store_d16_hi_b8 v1, v0, off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm bb: @@ -1107,8 +1129,10 @@ ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: flat_load_b32 v0, v[0:1] ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v1, s0, v1, 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: scratch_store_d16_hi_b16 v1, v0, s0 offset:2 +; GFX11-NEXT: scratch_store_d16_hi_b16 v1, v0, off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll @@ -13,16 +13,21 @@ ; GFX940-SDAG-LABEL: soff1_voff1: ; GFX940-SDAG: ; %bb.0: ; %bb ; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-SDAG-NEXT: s_add_i32 s0, s0, 4 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, s0 offset:1 sc0 sc1 +; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 +; GFX940-SDAG-NEXT: v_add_u32_e32 v0, v1, v0 +; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 1, v0 +; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 +; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: scratch_store_byte v0, v2, s0 offset:2 sc0 sc1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 +; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, s0 offset:4 sc0 sc1 +; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: s_endpgm ; @@ -103,17 +108,21 @@ ; GFX940-SDAG-LABEL: soff1_voff2: ; GFX940-SDAG: ; %bb.0: ; %bb ; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX940-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-SDAG-NEXT: s_add_i32 s0, s0, 4 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, s0 offset:1 sc0 sc1 +; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 +; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 1, v1 +; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 1, v0 +; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 +; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: scratch_store_byte v0, v2, s0 offset:2 sc0 sc1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 +; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, s0 offset:4 sc0 sc1 +; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: s_endpgm ; @@ -197,17 +206,21 @@ ; GFX940-SDAG-LABEL: soff1_voff4: ; GFX940-SDAG: ; %bb.0: ; %bb ; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX940-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-SDAG-NEXT: s_add_i32 s0, s0, 4 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, s0 offset:1 sc0 sc1 +; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 +; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 1, v0 +; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 +; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: scratch_store_byte v0, v2, s0 offset:2 sc0 sc1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 +; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, s0 offset:4 sc0 sc1 +; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: s_endpgm ; @@ -238,12 +251,16 @@ ; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 dlc +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0 +; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0 +; GFX11-SDAG-NEXT: scratch_store_b8 v4, v1, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v5, v2, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -287,17 +304,22 @@ ; GFX940-SDAG-LABEL: soff2_voff1: ; GFX940-SDAG: ; %bb.0: ; %bb ; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX940-SDAG-NEXT: s_add_i32 s0, s0, 4 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, s0 offset:1 sc0 sc1 +; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 +; GFX940-SDAG-NEXT: v_add_u32_e32 v0, v1, v0 +; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 1, v0 +; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 +; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: scratch_store_byte v0, v2, s0 offset:2 sc0 sc1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 +; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, s0 offset:4 sc0 sc1 +; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: s_endpgm ; @@ -381,18 +403,22 @@ ; GFX940-SDAG-LABEL: soff2_voff2: ; GFX940-SDAG: ; %bb.0: ; %bb ; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX940-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX940-SDAG-NEXT: s_add_i32 s0, s0, 4 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, s0 offset:1 sc0 sc1 +; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 +; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 1, v1 +; GFX940-SDAG-NEXT: v_or_b32_e32 v1, 1, v0 +; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 +; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: scratch_store_byte v0, v2, s0 offset:2 sc0 sc1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 +; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, s0 offset:4 sc0 sc1 +; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: s_endpgm ; @@ -481,18 +507,22 @@ ; GFX940-SDAG-LABEL: soff2_voff4: ; GFX940-SDAG: ; %bb.0: ; %bb ; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX940-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX940-SDAG-NEXT: s_add_i32 s0, s0, 4 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, s0 offset:1 sc0 sc1 +; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 +; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX940-SDAG-NEXT: v_or_b32_e32 v1, 1, v0 +; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 +; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: scratch_store_byte v0, v2, s0 offset:2 sc0 sc1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 +; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, s0 offset:4 sc0 sc1 +; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: s_endpgm ; @@ -521,17 +551,21 @@ ; GFX11-SDAG-LABEL: soff2_voff4: ; GFX11-SDAG: ; %bb.0: ; %bb ; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 dlc +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_or_b32_e32 v4, 1, v0 +; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0 +; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0 +; GFX11-SDAG-NEXT: scratch_store_b8 v4, v1, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v5, v2, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -577,17 +611,22 @@ ; GFX940-SDAG-LABEL: soff4_voff1: ; GFX940-SDAG: ; %bb.0: ; %bb ; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX940-SDAG-NEXT: s_add_i32 s0, s0, 4 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, s0 offset:1 sc0 sc1 +; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 +; GFX940-SDAG-NEXT: v_add_u32_e32 v0, v1, v0 +; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 1, v0 +; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 +; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: scratch_store_byte v0, v2, s0 offset:2 sc0 sc1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 +; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, s0 offset:4 sc0 sc1 +; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: s_endpgm ; @@ -615,20 +654,19 @@ ; GFX11-SDAG-LABEL: soff4_voff1: ; GFX11-SDAG: ; %bb.0: ; %bb ; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 1 :: v_dual_mov_b32 v3, 2 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v5, 4 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add3_u32 v1, 4, s0, v0 -; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 1, v1 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v1, 2, v1 -; GFX11-SDAG-NEXT: scratch_store_b8 v4, v2, off dlc +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_add_nc_u32 v4, 1, v0 +; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0 +; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0 +; GFX11-SDAG-NEXT: scratch_store_b8 v4, v1, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v1, v3, off dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v5, v2, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v5, s0 offset:4 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -672,18 +710,22 @@ ; GFX940-SDAG-LABEL: soff4_voff2: ; GFX940-SDAG: ; %bb.0: ; %bb ; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX940-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX940-SDAG-NEXT: s_add_i32 s0, s0, 4 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, s0 offset:1 sc0 sc1 +; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 +; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 1, v1 +; GFX940-SDAG-NEXT: v_or_b32_e32 v1, 1, v0 +; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 +; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: scratch_store_byte v0, v2, s0 offset:2 sc0 sc1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 +; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, s0 offset:4 sc0 sc1 +; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: s_endpgm ; @@ -712,20 +754,21 @@ ; GFX11-SDAG-LABEL: soff4_voff2: ; GFX11-SDAG: ; %bb.0: ; %bb ; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 2 :: v_dual_lshlrev_b32 v0, 1, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 1 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 4 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add_nc_u32_e64 v1, s0, 4 -; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4 -; GFX11-SDAG-NEXT: v_add3_u32 v1, v1, v0, 2 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:1 dlc +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_or_b32_e32 v4, 1, v0 +; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0 +; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0 +; GFX11-SDAG-NEXT: scratch_store_b8 v4, v1, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v1, v3, off dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v5, v2, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v4, s0 offset:4 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -771,18 +814,22 @@ ; GFX940-SDAG-LABEL: soff4_voff4: ; GFX940-SDAG: ; %bb.0: ; %bb ; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX940-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX940-SDAG-NEXT: s_add_i32 s0, s0, 4 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, s0 offset:1 sc0 sc1 +; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 +; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX940-SDAG-NEXT: v_or_b32_e32 v1, 1, v0 +; GFX940-SDAG-NEXT: v_or_b32_e32 v3, 2, v0 +; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: scratch_store_byte v0, v2, s0 offset:2 sc0 sc1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 +; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, s0 offset:4 sc0 sc1 +; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: s_endpgm ; @@ -811,17 +858,21 @@ ; GFX11-SDAG-LABEL: soff4_voff4: ; GFX11-SDAG: ; %bb.0: ; %bb ; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 dlc +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_or_b32_e32 v4, 1, v0 +; GFX11-SDAG-NEXT: v_or_b32_e32 v5, 2, v0 +; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0 +; GFX11-SDAG-NEXT: scratch_store_b8 v4, v1, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v5, v2, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -796,10 +796,10 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, s32 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX11-NEXT: scratch_store_b32 v0, v2, s32 dlc +; GFX11-NEXT: scratch_store_b32 v0, v2, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, v1, s32 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -822,10 +822,11 @@ ; GFX940-LABEL: store_load_vindex_foo: ; GFX940: ; %bb.0: ; %bb ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX940-NEXT: v_mov_b32_e32 v1, s32 +; GFX940-NEXT: v_lshl_add_u32 v1, v0, 2, v1 ; GFX940-NEXT: v_mov_b32_e32 v2, 15 ; GFX940-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX940-NEXT: scratch_store_dword v1, v2, s32 sc0 sc1 +; GFX940-NEXT: scratch_store_dword v1, v2, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-NEXT: scratch_load_dword v0, v0, s32 sc0 sc1 @@ -851,10 +852,10 @@ ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 -; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s32 ; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX11-PAL-NEXT: scratch_store_b32 v0, v2, s32 dlc +; GFX11-PAL-NEXT: scratch_store_b32 v0, v2, off dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, s32 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) @@ -1891,11 +1892,12 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_add_i32 vcc_lo, s32, 0x100 ; GFX11-NEXT: scratch_load_b32 v3, off, s32 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, vcc_lo ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX11-NEXT: scratch_store_b32 v0, v2, s32 offset:256 dlc +; GFX11-NEXT: scratch_store_b32 v0, v2, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, v1, s32 offset:256 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1923,10 +1925,12 @@ ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: scratch_load_dword v1, off, s32 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x100 +; GFX940-NEXT: v_mov_b32_e32 v1, vcc_hi +; GFX940-NEXT: v_lshl_add_u32 v1, v0, 2, v1 ; GFX940-NEXT: v_mov_b32_e32 v2, 15 ; GFX940-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX940-NEXT: scratch_store_dword v1, v2, s32 offset:256 sc0 sc1 +; GFX940-NEXT: scratch_store_dword v1, v2, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-NEXT: scratch_load_dword v0, v0, s32 offset:256 sc0 sc1 @@ -1956,11 +1960,12 @@ ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 -; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x100 ; GFX11-PAL-NEXT: scratch_load_b32 v3, off, s32 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX11-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, vcc_lo ; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX11-PAL-NEXT: scratch_store_b32 v0, v2, s32 offset:256 dlc +; GFX11-PAL-NEXT: scratch_store_b32 v0, v2, off dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, s32 offset:256 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) @@ -2960,14 +2965,14 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_add_i32 s0, s32, 0x4004 ; GFX11-NEXT: s_add_i32 vcc_lo, s32, 0x4004 +; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX11-NEXT: scratch_load_b32 v3, off, s32 offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b32 v0, v2, s0 dlc +; GFX11-NEXT: scratch_store_b32 v0, v2, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, v1, vcc_lo glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2995,11 +3000,12 @@ ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: scratch_load_dword v1, off, s32 offset:4 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX940-NEXT: v_mov_b32_e32 v2, 15 ; GFX940-NEXT: s_add_i32 vcc_lo, s32, 0x4004 +; GFX940-NEXT: v_mov_b32_e32 v1, vcc_lo +; GFX940-NEXT: v_lshl_add_u32 v1, v0, 2, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, 15 ; GFX940-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX940-NEXT: scratch_store_dword v1, v2, vcc_lo sc0 sc1 +; GFX940-NEXT: scratch_store_dword v1, v2, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x4004 @@ -3030,14 +3036,14 @@ ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 -; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-PAL-NEXT: s_add_i32 s0, s32, 0x4004 ; GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004 +; GFX11-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s0 ; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX11-PAL-NEXT: scratch_load_b32 v3, off, s32 offset:4 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: scratch_store_b32 v0, v2, s0 dlc +; GFX11-PAL-NEXT: scratch_store_b32 v0, v2, off dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, vcc_lo glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) @@ -3109,13 +3115,15 @@ ; ; GFX11-LABEL: store_load_large_imm_offset_kernel: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000 -; GFX11-NEXT: v_mov_b32_e32 v2, 15 +; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15 +; GFX11-NEXT: s_movk_i32 s0, 0x3000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s0, s0, 4 ; GFX11-NEXT: scratch_store_b32 off, v0, off offset:4 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_b32 v1, v2, off offset:3716 dlc +; GFX11-NEXT: scratch_store_b32 off, v1, s0 offset:3712 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:3716 glc dlc +; GFX11-NEXT: scratch_load_b32 v0, off, s0 offset:3712 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm ; @@ -3144,13 +3152,14 @@ ; GFX940-LABEL: store_load_large_imm_offset_kernel: ; GFX940: ; %bb.0: ; %bb ; GFX940-NEXT: v_mov_b32_e32 v0, 13 +; GFX940-NEXT: s_movk_i32 s0, 0x3000 ; GFX940-NEXT: scratch_store_dword off, v0, off offset:4 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, 0x3000 -; GFX940-NEXT: v_mov_b32_e32 v1, 15 -; GFX940-NEXT: scratch_store_dword v0, v1, off offset:3716 sc0 sc1 +; GFX940-NEXT: s_add_i32 s0, s0, 4 +; GFX940-NEXT: v_mov_b32_e32 v0, 15 +; GFX940-NEXT: scratch_store_dword off, v0, s0 offset:3712 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: scratch_load_dword v0, v0, off offset:3716 sc0 sc1 +; GFX940-NEXT: scratch_load_dword v0, off, s0 offset:3712 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm ; @@ -3203,13 +3212,15 @@ ; ; GFX11-PAL-LABEL: store_load_large_imm_offset_kernel: ; GFX11-PAL: ; %bb.0: ; %bb -; GFX11-PAL-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000 -; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 15 +; GFX11-PAL-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15 +; GFX11-PAL-NEXT: s_movk_i32 s0, 0x3000 +; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-PAL-NEXT: s_add_i32 s0, s0, 4 ; GFX11-PAL-NEXT: scratch_store_b32 off, v0, off offset:4 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-PAL-NEXT: scratch_store_b32 v1, v2, off offset:3716 dlc +; GFX11-PAL-NEXT: scratch_store_b32 off, v1, s0 offset:3712 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, off offset:3716 glc dlc +; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s0 offset:3712 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_endpgm bb: @@ -3261,13 +3272,16 @@ ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000 -; GFX11-NEXT: v_mov_b32_e32 v2, 15 +; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15 +; GFX11-NEXT: s_movk_i32 s0, 0x3000 +; GFX11-NEXT: s_add_i32 vcc_lo, s32, 4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s0, s0, vcc_lo ; GFX11-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_b32 v1, v2, s32 offset:3716 dlc +; GFX11-NEXT: scratch_store_b32 off, v1, s0 offset:3712 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_b32 v0, v1, s32 offset:3716 glc dlc +; GFX11-NEXT: scratch_load_b32 v0, off, s0 offset:3712 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3291,13 +3305,15 @@ ; GFX940: ; %bb.0: ; %bb ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, 13 +; GFX940-NEXT: s_movk_i32 s0, 0x3000 +; GFX940-NEXT: s_add_i32 vcc_hi, s32, 4 ; GFX940-NEXT: scratch_store_dword off, v0, s32 offset:4 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, 0x3000 -; GFX940-NEXT: v_mov_b32_e32 v1, 15 -; GFX940-NEXT: scratch_store_dword v0, v1, s32 offset:3716 sc0 sc1 +; GFX940-NEXT: s_add_i32 s0, s0, vcc_hi +; GFX940-NEXT: v_mov_b32_e32 v0, 15 +; GFX940-NEXT: scratch_store_dword off, v0, s0 offset:3712 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: scratch_load_dword v0, v0, s32 offset:3716 sc0 sc1 +; GFX940-NEXT: scratch_load_dword v0, off, s0 offset:3712 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -3322,13 +3338,16 @@ ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-PAL-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000 -; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 15 +; GFX11-PAL-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15 +; GFX11-PAL-NEXT: s_movk_i32 s0, 0x3000 +; GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 4 +; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-PAL-NEXT: s_add_i32 s0, s0, vcc_lo ; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-PAL-NEXT: scratch_store_b32 v1, v2, s32 offset:3716 dlc +; GFX11-PAL-NEXT: scratch_store_b32 off, v1, s0 offset:3712 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, s32 offset:3716 glc dlc +; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s0 offset:3712 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] ; GCN-LABEL: store_load_large_imm_offset_foo: @@ -3394,12 +3413,15 @@ ; GFX11-LABEL: store_load_vidx_sidx_offset: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 15 +; GFX11-NEXT: v_mov_b32_e32 v1, 4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_lshl_u32 v0, s0, v0, 2 -; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:1028 dlc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v0, v1, v0, 0x400 +; GFX11-NEXT: v_mov_b32_e32 v1, 15 +; GFX11-NEXT: scratch_store_b32 v0, v1, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_b32 v0, v0, off offset:1028 glc dlc +; GFX11-NEXT: scratch_load_b32 v0, v0, off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm ; @@ -3427,12 +3449,15 @@ ; GFX940-LABEL: store_load_vidx_sidx_offset: ; GFX940: ; %bb.0: ; %bb ; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-NEXT: v_mov_b32_e32 v1, 15 +; GFX940-NEXT: v_mov_b32_e32 v1, 4 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_add_lshl_u32 v0, s0, v0, 2 -; GFX940-NEXT: scratch_store_dword v0, v1, off offset:1028 sc0 sc1 +; GFX940-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX940-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX940-NEXT: v_add_u32_e32 v0, 0x400, v0 +; GFX940-NEXT: v_mov_b32_e32 v1, 15 +; GFX940-NEXT: scratch_store_dword v0, v1, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: scratch_load_dword v0, v0, off offset:1028 sc0 sc1 +; GFX940-NEXT: scratch_load_dword v0, v0, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm ; @@ -3462,12 +3487,15 @@ ; GFX11-PAL-LABEL: store_load_vidx_sidx_offset: ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 15 +; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 4 ; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PAL-NEXT: v_add_lshl_u32 v0, s0, v0, 2 -; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:1028 dlc +; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-PAL-NEXT: v_add3_u32 v0, v1, v0, 0x400 +; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 15 +; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-PAL-NEXT: scratch_load_b32 v0, v0, off offset:1028 glc dlc +; GFX11-PAL-NEXT: scratch_load_b32 v0, v0, off glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_endpgm ; GCN-LABEL: store_load_vidx_sidx_offset: @@ -4029,11 +4057,11 @@ ; GFX940-LABEL: store_load_i32_large_negative_unaligned: ; GFX940: ; %bb.0: ; %bb ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_movk_i32 s0, 0xef7f +; GFX940-NEXT: v_add_u32_e32 v0, 0xffffef7f, v0 ; GFX940-NEXT: v_mov_b32_e32 v1, 1 -; GFX940-NEXT: scratch_store_byte v0, v1, s0 sc0 sc1 +; GFX940-NEXT: scratch_store_byte v0, v1, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: scratch_load_ubyte v0, v0, s0 sc0 sc1 +; GFX940-NEXT: scratch_load_ubyte v0, v0, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll @@ -310,10 +310,10 @@ ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: scratch_load_dword v0, v0, s4 nt +; GFX940-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s4 +; GFX940-NOTTGSPLIT-NEXT: scratch_load_dword v0, v0, off nt ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm @@ -322,21 +322,23 @@ ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: scratch_load_dword v0, v0, s4 nt +; GFX940-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s4 +; GFX940-TGSPLIT-NEXT: scratch_load_dword v0, v0, off nt ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_nontemporal_load_1: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: scratch_load_b32 v0, v0, s2 slc dlc +; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX11-WGP-NEXT: scratch_load_b32 v0, v0, off slc dlc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -344,11 +346,13 @@ ; ; GFX11-CU-LABEL: private_nontemporal_load_1: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: scratch_load_b32 v0, v0, s2 slc dlc +; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX11-CU-NEXT: scratch_load_b32 v0, v0, off slc dlc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -657,24 +661,24 @@ ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-NOTTGSPLIT-NEXT: scratch_store_dword v0, v1, s4 nt +; GFX940-NOTTGSPLIT-NEXT: scratch_store_dword v0, v1, off nt ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: private_nontemporal_store_1: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s4 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-TGSPLIT-NEXT: scratch_store_dword v0, v1, s4 nt +; GFX940-TGSPLIT-NEXT: scratch_store_dword v0, v1, off nt ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_nontemporal_store_1: @@ -684,9 +688,10 @@ ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-WGP-NEXT: scratch_store_b32 v0, v1, s0 glc slc dlc +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: scratch_store_b32 v0, v1, off glc slc dlc ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -697,9 +702,10 @@ ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-CU-NEXT: scratch_store_b32 v0, v1, s0 glc slc dlc +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: scratch_store_b32 v0, v1, off glc slc dlc ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(5) %out) { diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll @@ -222,11 +222,13 @@ ; ; GFX11-WGP-LABEL: private_volatile_load_1: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: scratch_load_b32 v0, v0, s2 glc dlc +; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX11-WGP-NEXT: scratch_load_b32 v0, v0, off glc dlc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -234,11 +236,13 @@ ; ; GFX11-CU-LABEL: private_volatile_load_1: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: scratch_load_b32 v0, v0, s2 glc dlc +; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX11-CU-NEXT: scratch_load_b32 v0, v0, off glc dlc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -480,9 +484,10 @@ ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-WGP-NEXT: scratch_store_b32 v0, v1, s0 dlc +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: scratch_store_b32 v0, v1, off dlc ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -494,9 +499,10 @@ ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-CU-NEXT: scratch_store_b32 v0, v1, s0 dlc +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: scratch_store_b32 v0, v1, off dlc ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm