Index: llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1880,6 +1880,11 @@ return false; } + KnownBits SAddrKnown = CurDAG->computeKnownBits(SAddr); + KnownBits VAddrKnown = CurDAG->computeKnownBits(VAddr); + if (!SAddrKnown.isNonNegative() || !VAddrKnown.isNonNegative()) + return false; + if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset)) return false; SAddr = SelectSAddrFI(CurDAG, SAddr); Index: llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll +++ llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll @@ -13,16 +13,21 @@ ; GFX940-SDAG-LABEL: soff1_voff1: ; GFX940-SDAG: ; %bb.0: ; %bb ; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-SDAG-NEXT: s_add_i32 s0, s0, 4 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, s0 offset:1 sc0 sc1 +; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 +; GFX940-SDAG-NEXT: v_add_u32_e32 v0, v1, v0 +; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 1, v0 +; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 +; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: scratch_store_byte v0, v2, s0 offset:2 sc0 sc1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 +; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, s0 offset:4 sc0 sc1 +; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: s_endpgm ; @@ -97,17 +102,21 @@ ; GFX940-SDAG-LABEL: soff1_voff2: ; GFX940-SDAG: ; %bb.0: ; %bb ; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX940-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-SDAG-NEXT: s_add_i32 s0, s0, 4 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, s0 offset:1 sc0 sc1 +; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 +; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 1, v1 +; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 1, v0 +; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 +; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: scratch_store_byte v0, v2, s0 offset:2 sc0 sc1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 +; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, s0 offset:4 sc0 sc1 +; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: s_endpgm ; @@ -185,17 +194,21 @@ ; GFX940-SDAG-LABEL: soff1_voff4: ; GFX940-SDAG: ; %bb.0: ; %bb ; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX940-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-SDAG-NEXT: s_add_i32 s0, s0, 4 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, s0 offset:1 sc0 sc1 +; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 +; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 1, v0 +; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 +; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: scratch_store_byte v0, v2, s0 offset:2 sc0 sc1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 +; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, s0 offset:4 sc0 sc1 +; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: s_endpgm ; @@ -223,12 +236,16 @@ ; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 dlc +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0 +; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0 +; GFX11-SDAG-NEXT: scratch_store_b8 v4, v1, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v5, v2, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -269,17 +286,22 @@ ; GFX940-SDAG-LABEL: soff2_voff1: ; GFX940-SDAG: ; %bb.0: ; %bb ; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX940-SDAG-NEXT: s_add_i32 s0, s0, 4 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, s0 offset:1 sc0 sc1 +; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 +; GFX940-SDAG-NEXT: v_add_u32_e32 v0, v1, v0 +; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 1, v0 +; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 +; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: scratch_store_byte v0, v2, s0 offset:2 sc0 sc1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 +; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, s0 offset:4 sc0 sc1 +; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: s_endpgm ; @@ -358,18 +380,22 @@ ; GFX940-SDAG-LABEL: soff2_voff2: ; GFX940-SDAG: ; %bb.0: ; %bb ; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX940-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX940-SDAG-NEXT: s_add_i32 s0, s0, 4 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, s0 offset:1 sc0 sc1 +; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 +; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 1, v1 +; GFX940-SDAG-NEXT: v_or_b32_e32 v1, 1, v0 +; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 +; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: scratch_store_byte v0, v2, s0 offset:2 sc0 sc1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 +; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, s0 offset:4 sc0 sc1 +; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: s_endpgm ; @@ -451,18 +477,22 @@ ; GFX940-SDAG-LABEL: soff2_voff4: ; GFX940-SDAG: ; %bb.0: ; %bb ; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX940-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX940-SDAG-NEXT: s_add_i32 s0, s0, 4 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, s0 offset:1 sc0 sc1 +; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 +; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX940-SDAG-NEXT: v_or_b32_e32 v1, 1, v0 +; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 +; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: scratch_store_byte v0, v2, s0 offset:2 sc0 sc1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 +; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, s0 offset:4 sc0 sc1 +; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: s_endpgm ; @@ -488,17 +518,21 @@ ; GFX11-SDAG-LABEL: soff2_voff4: ; GFX11-SDAG: ; %bb.0: ; %bb ; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 dlc +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_or_b32_e32 v4, 1, v0 +; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0 +; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0 +; GFX11-SDAG-NEXT: scratch_store_b8 v4, v1, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v5, v2, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -540,17 +574,22 @@ ; GFX940-SDAG-LABEL: soff4_voff1: ; GFX940-SDAG: ; %bb.0: ; %bb ; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX940-SDAG-NEXT: s_add_i32 s0, s0, 4 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, s0 offset:1 sc0 sc1 +; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 +; GFX940-SDAG-NEXT: v_add_u32_e32 v0, v1, v0 +; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 1, v0 +; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 +; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: scratch_store_byte v0, v2, s0 offset:2 sc0 sc1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 +; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, s0 offset:4 sc0 sc1 +; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: s_endpgm ; @@ -575,20 +614,19 @@ ; GFX11-SDAG-LABEL: soff4_voff1: ; GFX11-SDAG: ; %bb.0: ; %bb ; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 1 :: v_dual_mov_b32 v3, 2 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v5, 4 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add3_u32 v1, 4, s0, v0 -; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 1, v1 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v1, 2, v1 -; GFX11-SDAG-NEXT: scratch_store_b8 v4, v2, off dlc +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_add_nc_u32 v4, 1, v0 +; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0 +; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0 +; GFX11-SDAG-NEXT: scratch_store_b8 v4, v1, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v1, v3, off dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v5, v2, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v5, s0 offset:4 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -630,18 +668,22 @@ ; GFX940-SDAG-LABEL: soff4_voff2: ; GFX940-SDAG: ; %bb.0: ; %bb ; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX940-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX940-SDAG-NEXT: s_add_i32 s0, s0, 4 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, s0 offset:1 sc0 sc1 +; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 +; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 1, v1 +; GFX940-SDAG-NEXT: v_or_b32_e32 v1, 1, v0 +; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 +; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: scratch_store_byte v0, v2, s0 offset:2 sc0 sc1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 +; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, s0 offset:4 sc0 sc1 +; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: s_endpgm ; @@ -667,20 +709,21 @@ ; GFX11-SDAG-LABEL: soff4_voff2: ; GFX11-SDAG: ; %bb.0: ; %bb ; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 2 :: v_dual_lshlrev_b32 v0, 1, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 1 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 4 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add_nc_u32_e64 v1, s0, 4 -; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4 -; GFX11-SDAG-NEXT: v_add3_u32 v1, v1, v0, 2 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:1 dlc +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_or_b32_e32 v4, 1, v0 +; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0 +; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0 +; GFX11-SDAG-NEXT: scratch_store_b8 v4, v1, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v1, v3, off dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v5, v2, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v4, s0 offset:4 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -722,18 +765,22 @@ ; GFX940-SDAG-LABEL: soff4_voff4: ; GFX940-SDAG: ; %bb.0: ; %bb ; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX940-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX940-SDAG-NEXT: s_add_i32 s0, s0, 4 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, s0 offset:1 sc0 sc1 +; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 +; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX940-SDAG-NEXT: v_or_b32_e32 v1, 1, v0 +; GFX940-SDAG-NEXT: v_or_b32_e32 v3, 2, v0 +; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: scratch_store_byte v0, v2, s0 offset:2 sc0 sc1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 +; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, s0 offset:4 sc0 sc1 +; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: s_endpgm ; @@ -759,17 +806,21 @@ ; GFX11-SDAG-LABEL: soff4_voff4: ; GFX11-SDAG: ; %bb.0: ; %bb ; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 dlc +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_or_b32_e32 v4, 1, v0 +; GFX11-SDAG-NEXT: v_or_b32_e32 v5, 2, v0 +; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0 +; GFX11-SDAG-NEXT: scratch_store_b8 v4, v1, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v5, v2, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm Index: llvm/test/CodeGen/AMDGPU/flat-scratch.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -796,10 +796,10 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, s32 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX11-NEXT: scratch_store_b32 v0, v2, s32 dlc +; GFX11-NEXT: scratch_store_b32 v0, v2, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, v1, s32 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -822,10 +822,11 @@ ; GFX940-LABEL: store_load_vindex_foo: ; GFX940: ; %bb.0: ; %bb ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX940-NEXT: v_mov_b32_e32 v1, s32 +; GFX940-NEXT: v_lshl_add_u32 v1, v0, 2, v1 ; GFX940-NEXT: v_mov_b32_e32 v2, 15 ; GFX940-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX940-NEXT: scratch_store_dword v1, v2, s32 sc0 sc1 +; GFX940-NEXT: scratch_store_dword v1, v2, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-NEXT: scratch_load_dword v0, v0, s32 sc0 sc1 @@ -851,10 +852,10 @@ ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 -; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s32 ; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX11-PAL-NEXT: scratch_store_b32 v0, v2, s32 dlc +; GFX11-PAL-NEXT: scratch_store_b32 v0, v2, off dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, s32 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) @@ -1891,11 +1892,12 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_add_i32 vcc_lo, s32, 0x100 ; GFX11-NEXT: scratch_load_b32 v3, off, s32 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, vcc_lo ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX11-NEXT: scratch_store_b32 v0, v2, s32 offset:256 dlc +; GFX11-NEXT: scratch_store_b32 v0, v2, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, v1, s32 offset:256 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1923,10 +1925,12 @@ ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: scratch_load_dword v1, off, s32 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x100 +; GFX940-NEXT: v_mov_b32_e32 v1, vcc_hi +; GFX940-NEXT: v_lshl_add_u32 v1, v0, 2, v1 ; GFX940-NEXT: v_mov_b32_e32 v2, 15 ; GFX940-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX940-NEXT: scratch_store_dword v1, v2, s32 offset:256 sc0 sc1 +; GFX940-NEXT: scratch_store_dword v1, v2, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-NEXT: scratch_load_dword v0, v0, s32 offset:256 sc0 sc1 @@ -1956,11 +1960,12 @@ ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 -; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x100 ; GFX11-PAL-NEXT: scratch_load_b32 v3, off, s32 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX11-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, vcc_lo ; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX11-PAL-NEXT: scratch_store_b32 v0, v2, s32 offset:256 dlc +; GFX11-PAL-NEXT: scratch_store_b32 v0, v2, off dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, s32 offset:256 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) @@ -2960,14 +2965,14 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_add_i32 s0, s32, 0x4004 ; GFX11-NEXT: s_add_i32 vcc_lo, s32, 0x4004 +; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX11-NEXT: scratch_load_b32 v3, off, s32 offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b32 v0, v2, s0 dlc +; GFX11-NEXT: scratch_store_b32 v0, v2, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, v1, vcc_lo glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2995,11 +3000,12 @@ ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: scratch_load_dword v1, off, s32 offset:4 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX940-NEXT: v_mov_b32_e32 v2, 15 ; GFX940-NEXT: s_add_i32 vcc_lo, s32, 0x4004 +; GFX940-NEXT: v_mov_b32_e32 v1, vcc_lo +; GFX940-NEXT: v_lshl_add_u32 v1, v0, 2, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, 15 ; GFX940-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX940-NEXT: scratch_store_dword v1, v2, vcc_lo sc0 sc1 +; GFX940-NEXT: scratch_store_dword v1, v2, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x4004 @@ -3030,14 +3036,14 @@ ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 -; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-PAL-NEXT: s_add_i32 s0, s32, 0x4004 ; GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004 +; GFX11-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s0 ; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX11-PAL-NEXT: scratch_load_b32 v3, off, s32 offset:4 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: scratch_store_b32 v0, v2, s0 dlc +; GFX11-PAL-NEXT: scratch_store_b32 v0, v2, off dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, vcc_lo glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) @@ -3394,12 +3400,15 @@ ; GFX11-LABEL: store_load_vidx_sidx_offset: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 15 +; GFX11-NEXT: v_mov_b32_e32 v1, 4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_lshl_u32 v0, s0, v0, 2 -; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:1028 dlc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v0, v1, v0, 0x400 +; GFX11-NEXT: v_mov_b32_e32 v1, 15 +; GFX11-NEXT: scratch_store_b32 v0, v1, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_b32 v0, v0, off offset:1028 glc dlc +; GFX11-NEXT: scratch_load_b32 v0, v0, off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm ; @@ -3427,12 +3436,15 @@ ; GFX940-LABEL: store_load_vidx_sidx_offset: ; GFX940: ; %bb.0: ; %bb ; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-NEXT: v_mov_b32_e32 v1, 15 +; GFX940-NEXT: v_mov_b32_e32 v1, 4 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_add_lshl_u32 v0, s0, v0, 2 -; GFX940-NEXT: scratch_store_dword v0, v1, off offset:1028 sc0 sc1 +; GFX940-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX940-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX940-NEXT: v_add_u32_e32 v0, 0x400, v0 +; GFX940-NEXT: v_mov_b32_e32 v1, 15 +; GFX940-NEXT: scratch_store_dword v0, v1, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: scratch_load_dword v0, v0, off offset:1028 sc0 sc1 +; GFX940-NEXT: scratch_load_dword v0, v0, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm ; @@ -3462,12 +3474,15 @@ ; GFX11-PAL-LABEL: store_load_vidx_sidx_offset: ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 15 +; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 4 ; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PAL-NEXT: v_add_lshl_u32 v0, s0, v0, 2 -; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:1028 dlc +; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-PAL-NEXT: v_add3_u32 v0, v1, v0, 0x400 +; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 15 +; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-PAL-NEXT: scratch_load_b32 v0, v0, off offset:1028 glc dlc +; GFX11-PAL-NEXT: scratch_load_b32 v0, v0, off glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_endpgm ; GCN-LABEL: store_load_vidx_sidx_offset: @@ -4029,11 +4044,11 @@ ; GFX940-LABEL: store_load_i32_large_negative_unaligned: ; GFX940: ; %bb.0: ; %bb ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_movk_i32 s0, 0xef7f +; GFX940-NEXT: v_add_u32_e32 v0, 0xffffef7f, v0 ; GFX940-NEXT: v_mov_b32_e32 v1, 1 -; GFX940-NEXT: scratch_store_byte v0, v1, s0 sc0 sc1 +; GFX940-NEXT: scratch_store_byte v0, v1, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: scratch_load_ubyte v0, v0, s0 sc0 sc1 +; GFX940-NEXT: scratch_load_ubyte v0, v0, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; Index: llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll +++ llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll @@ -310,10 +310,10 @@ ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: scratch_load_dword v0, v0, s4 nt +; GFX940-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s4 +; GFX940-NOTTGSPLIT-NEXT: scratch_load_dword v0, v0, off nt ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm @@ -322,21 +322,23 @@ ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: scratch_load_dword v0, v0, s4 nt +; GFX940-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s4 +; GFX940-TGSPLIT-NEXT: scratch_load_dword v0, v0, off nt ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_nontemporal_load_1: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: scratch_load_b32 v0, v0, s2 slc dlc +; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX11-WGP-NEXT: scratch_load_b32 v0, v0, off slc dlc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -344,11 +346,13 @@ ; ; GFX11-CU-LABEL: private_nontemporal_load_1: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: scratch_load_b32 v0, v0, s2 slc dlc +; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX11-CU-NEXT: scratch_load_b32 v0, v0, off slc dlc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -657,24 +661,24 @@ ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-NOTTGSPLIT-NEXT: scratch_store_dword v0, v1, s4 nt +; GFX940-NOTTGSPLIT-NEXT: scratch_store_dword v0, v1, off nt ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: private_nontemporal_store_1: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s4 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-TGSPLIT-NEXT: scratch_store_dword v0, v1, s4 nt +; GFX940-TGSPLIT-NEXT: scratch_store_dword v0, v1, off nt ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_nontemporal_store_1: @@ -684,9 +688,10 @@ ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-WGP-NEXT: scratch_store_b32 v0, v1, s0 glc slc dlc +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: scratch_store_b32 v0, v1, off glc slc dlc ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -697,9 +702,10 @@ ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-CU-NEXT: scratch_store_b32 v0, v1, s0 glc slc dlc +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: scratch_store_b32 v0, v1, off glc slc dlc ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(5) %out) { Index: llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll +++ llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll @@ -222,11 +222,13 @@ ; ; GFX11-WGP-LABEL: private_volatile_load_1: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: scratch_load_b32 v0, v0, s2 glc dlc +; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX11-WGP-NEXT: scratch_load_b32 v0, v0, off glc dlc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -234,11 +236,13 @@ ; ; GFX11-CU-LABEL: private_volatile_load_1: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: scratch_load_b32 v0, v0, s2 glc dlc +; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX11-CU-NEXT: scratch_load_b32 v0, v0, off glc dlc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -480,9 +484,10 @@ ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-WGP-NEXT: scratch_store_b32 v0, v1, s0 dlc +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: scratch_store_b32 v0, v1, off dlc ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -494,9 +499,10 @@ ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-CU-NEXT: scratch_store_b32 v0, v1, s0 dlc +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: scratch_store_b32 v0, v1, off dlc ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm