Index: llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1533,8 +1533,19 @@
 
       const SIInstrInfo *TII = Subtarget->getInstrInfo();
       if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
-        Addr = N0;
-        OffsetVal = COffsetVal;
+        // If the 32-bit VGPR base may be negative, compute the scratch address
+        // with a 32-bit add instruction; otherwise (base known non-negative)
+        // use VGPR base (unsigned) + immediate offset.
+        if (FlatVariant == SIInstrFlags::FlatScratch) {
+          KnownBits AddrKnown = CurDAG->computeKnownBits(N0);
+          if (AddrKnown.isNonNegative()) {
+            Addr = N0;
+            OffsetVal = COffsetVal;
+          }
+        } else {
+          Addr = N0;
+          OffsetVal = COffsetVal;
+        }
       } else {
         // If the offset doesn't fit, put the low bits into the offset field and
         // add the rest.
Index: llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -734,11 +734,11 @@
 ; FLATSCR:       ; %bb.0: ; %bb
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FLATSCR-NEXT:    scratch_load_short_d16_hi v1, v0, off
+; FLATSCR-NEXT:    v_add_u32_e32 v2, 2, v0
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; FLATSCR-NEXT:    v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
-; FLATSCR-NEXT:    scratch_load_short_d16 v1, v0, off offset:2
+; FLATSCR-NEXT:    v_pk_sub_u16 v0, v1, -12 op_sel_hi:[1,0]
+; FLATSCR-NEXT:    scratch_load_short_d16 v0, v2, off
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; FLATSCR-NEXT:    v_mov_b32_e32 v0, v1
 ; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10_DEFAULT-LABEL: chain_hi_to_lo_private_other_dep:
@@ -758,11 +758,11 @@
 ; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FLATSCR_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; FLATSCR_GFX10-NEXT:    scratch_load_short_d16_hi v1, v0, off
+; FLATSCR_GFX10-NEXT:    v_add_nc_u32_e32 v2, 2, v0
 ; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0)
-; FLATSCR_GFX10-NEXT:    v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
-; FLATSCR_GFX10-NEXT:    scratch_load_short_d16 v1, v0, off offset:2
+; FLATSCR_GFX10-NEXT:    v_pk_sub_u16 v0, v1, -12 op_sel_hi:[1,0]
+; FLATSCR_GFX10-NEXT:    scratch_load_short_d16 v0, v2, off
 ; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0)
-; FLATSCR_GFX10-NEXT:    v_mov_b32_e32 v0, v1
 ; FLATSCR_GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: chain_hi_to_lo_private_other_dep:
@@ -770,11 +770,11 @@
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    scratch_load_d16_hi_b16 v1, v0, off
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 2, v0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
-; GFX11-NEXT:    scratch_load_d16_b16 v1, v0, off offset:2
+; GFX11-NEXT:    v_pk_sub_u16 v0, v1, -12 op_sel_hi:[1,0]
+; GFX11-NEXT:    scratch_load_d16_b16 v0, v2, off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %gep_lo = getelementptr inbounds i16, ptr addrspace(5) %ptr, i64 1
Index: llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
+++ llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
@@ -46,10 +46,11 @@
 ; GFX9-FLASTSCR-LABEL: private_load_2xi16_align2:
 ; GFX9-FLASTSCR:       ; %bb.0:
 ; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-FLASTSCR-NEXT:    scratch_load_ushort v1, v0, off
-; GFX9-FLASTSCR-NEXT:    scratch_load_ushort v2, v0, off offset:2
+; GFX9-FLASTSCR-NEXT:    v_add_u32_e32 v1, 2, v0
+; GFX9-FLASTSCR-NEXT:    scratch_load_ushort v2, v0, off
+; GFX9-FLASTSCR-NEXT:    scratch_load_ushort v3, v1, off
 ; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLASTSCR-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
+; GFX9-FLASTSCR-NEXT:    v_lshl_or_b32 v0, v3, 16, v2
 ; GFX9-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: private_load_2xi16_align2:
@@ -67,33 +68,36 @@
 ; GFX10-FLASTSCR:       ; %bb.0:
 ; GFX10-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLASTSCR-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-FLASTSCR-NEXT:    v_add_nc_u32_e32 v1, 2, v0
 ; GFX10-FLASTSCR-NEXT:    s_clause 0x1
-; GFX10-FLASTSCR-NEXT:    scratch_load_ushort v1, v0, off
-; GFX10-FLASTSCR-NEXT:    scratch_load_ushort v2, v0, off offset:2
+; GFX10-FLASTSCR-NEXT:    scratch_load_ushort v2, v0, off
+; GFX10-FLASTSCR-NEXT:    scratch_load_ushort v3, v1, off
 ; GFX10-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLASTSCR-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
+; GFX10-FLASTSCR-NEXT:    v_lshl_or_b32 v0, v3, 16, v2
 ; GFX10-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: private_load_2xi16_align2:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 2, v0
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_u16 v1, v0, off
-; GFX11-NEXT:    scratch_load_u16 v0, v0, off offset:2
+; GFX11-NEXT:    scratch_load_u16 v0, v0, off
+; GFX11-NEXT:    scratch_load_u16 v1, v1, off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX11-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FLASTSCR-LABEL: private_load_2xi16_align2:
 ; GFX11-FLASTSCR:       ; %bb.0:
 ; GFX11-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLASTSCR-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FLASTSCR-NEXT:    v_add_nc_u32_e32 v1, 2, v0
 ; GFX11-FLASTSCR-NEXT:    s_clause 0x1
-; GFX11-FLASTSCR-NEXT:    scratch_load_u16 v1, v0, off
-; GFX11-FLASTSCR-NEXT:    scratch_load_u16 v0, v0, off offset:2
+; GFX11-FLASTSCR-NEXT:    scratch_load_u16 v0, v0, off
+; GFX11-FLASTSCR-NEXT:    scratch_load_u16 v1, v1, off
 ; GFX11-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLASTSCR-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX11-FLASTSCR-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX11-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
   %gep.p = getelementptr i16, ptr addrspace(5) %p, i64 1
   %p.0 = load i16, ptr addrspace(5) %p, align 2
@@ -142,10 +146,11 @@
 ; GFX9-FLASTSCR-LABEL: private_store_2xi16_align2:
 ; GFX9-FLASTSCR:       ; %bb.0:
 ; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-FLASTSCR-NEXT:    v_mov_b32_e32 v0, 1
-; GFX9-FLASTSCR-NEXT:    scratch_store_short v1, v0, off
-; GFX9-FLASTSCR-NEXT:    v_mov_b32_e32 v0, 2
-; GFX9-FLASTSCR-NEXT:    scratch_store_short v1, v0, off offset:2
+; GFX9-FLASTSCR-NEXT:    v_mov_b32_e32 v2, 1
+; GFX9-FLASTSCR-NEXT:    v_add_u32_e32 v0, 2, v1
+; GFX9-FLASTSCR-NEXT:    scratch_store_short v1, v2, off
+; GFX9-FLASTSCR-NEXT:    v_mov_b32_e32 v1, 2
+; GFX9-FLASTSCR-NEXT:    scratch_store_short v0, v1, off
 ; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -165,9 +170,10 @@
 ; GFX10-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLASTSCR-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-FLASTSCR-NEXT:    v_mov_b32_e32 v0, 1
-; GFX10-FLASTSCR-NEXT:    v_mov_b32_e32 v2, 2
+; GFX10-FLASTSCR-NEXT:    v_add_nc_u32_e32 v2, 2, v1
+; GFX10-FLASTSCR-NEXT:    v_mov_b32_e32 v3, 2
 ; GFX10-FLASTSCR-NEXT:    scratch_store_short v1, v0, off
-; GFX10-FLASTSCR-NEXT:    scratch_store_short v1, v2, off offset:2
+; GFX10-FLASTSCR-NEXT:    scratch_store_short v2, v3, off
 ; GFX10-FLASTSCR-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -175,11 +181,11 @@
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 1
-; GFX11-NEXT:    v_mov_b32_e32 v2, 2
+; GFX11-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v3, 2
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 2, v1
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    scratch_store_b16 v1, v0, off
-; GFX11-NEXT:    scratch_store_b16 v1, v2, off offset:2
+; GFX11-NEXT:    scratch_store_b16 v2, v3, off
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -187,11 +193,11 @@
 ; GFX11-FLASTSCR:       ; %bb.0:
 ; GFX11-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLASTSCR-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FLASTSCR-NEXT:    v_mov_b32_e32 v0, 1
-; GFX11-FLASTSCR-NEXT:    v_mov_b32_e32 v2, 2
+; GFX11-FLASTSCR-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v3, 2
+; GFX11-FLASTSCR-NEXT:    v_add_nc_u32_e32 v2, 2, v1
 ; GFX11-FLASTSCR-NEXT:    s_clause 0x1
 ; GFX11-FLASTSCR-NEXT:    scratch_store_b16 v1, v0, off
-; GFX11-FLASTSCR-NEXT:    scratch_store_b16 v1, v2, off offset:2
+; GFX11-FLASTSCR-NEXT:    scratch_store_b16 v2, v3, off
 ; GFX11-FLASTSCR-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
   %gep.r = getelementptr i16, ptr addrspace(5) %r, i64 1
Index: llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll
+++ llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll
@@ -11,14 +11,16 @@
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    scratch_load_ubyte v0, v0, off offset:1
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, 1, v0
+; GFX10-NEXT:    scratch_load_ubyte v0, v0, off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i8_zext_v:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    scratch_load_u8 v0, v0, off offset:1
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 1, v0
+; GFX11-NEXT:    scratch_load_u8 v0, v0, off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -37,14 +39,16 @@
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    scratch_load_sbyte v0, v0, off offset:1
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, 1, v0
+; GFX10-NEXT:    scratch_load_sbyte v0, v0, off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i8_sext_v:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    scratch_load_i8 v0, v0, off offset:1
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 1, v0
+; GFX11-NEXT:    scratch_load_i8 v0, v0, off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -63,14 +67,16 @@
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    scratch_load_ushort v0, v0, off offset:2
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, 2, v0
+; GFX10-NEXT:    scratch_load_ushort v0, v0, off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i16_zext_v:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    scratch_load_u16 v0, v0, off offset:2
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 2, v0
+; GFX11-NEXT:    scratch_load_u16 v0, v0, off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -89,14 +95,16 @@
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    scratch_load_sshort v0, v0, off offset:2
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, 2, v0
+; GFX10-NEXT:    scratch_load_sshort v0, v0, off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i16_sext_v:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    scratch_load_i16 v0, v0, off offset:2
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 2, v0
+; GFX11-NEXT:    scratch_load_i16 v0, v0, off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -115,16 +123,17 @@
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffff0000
-; GFX10-NEXT:    scratch_load_ubyte_d16 v3, v0, off offset:1
+; GFX10-NEXT:    scratch_load_ubyte_d16 v3, v0, off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v3
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_lo_v:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    v_mov_b32_e32 v3, 0xffff0000
-; GFX11-NEXT:    scratch_load_d16_u8 v3, v0, off offset:1
+; GFX11-NEXT:    v_dual_mov_b32 v3, 0xffff0000 :: v_dual_add_nc_u32 v0, 1, v0
+; GFX11-NEXT:    scratch_load_d16_u8 v3, v0, off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -145,16 +154,17 @@
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffff0000
-; GFX10-NEXT:    scratch_load_sbyte_d16 v3, v0, off offset:1
+; GFX10-NEXT:    scratch_load_sbyte_d16 v3, v0, off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v3
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_lo_v:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    v_mov_b32_e32 v3, 0xffff0000
-; GFX11-NEXT:    scratch_load_d16_i8 v3, v0, off offset:1
+; GFX11-NEXT:    v_dual_mov_b32 v3, 0xffff0000 :: v_dual_add_nc_u32 v0, 1, v0
+; GFX11-NEXT:    scratch_load_d16_i8 v3, v0, off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -175,16 +185,17 @@
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffff0000
-; GFX10-NEXT:    scratch_load_short_d16 v3, v0, off offset:2
+; GFX10-NEXT:    scratch_load_short_d16 v3, v0, off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v3
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i16_to_d16_lo_v:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    v_mov_b32_e32 v3, 0xffff0000
-; GFX11-NEXT:    scratch_load_d16_b16 v3, v0, off offset:2
+; GFX11-NEXT:    v_dual_mov_b32 v3, 0xffff0000 :: v_dual_add_nc_u32 v0, 2, v0
+; GFX11-NEXT:    scratch_load_d16_b16 v3, v0, off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -205,16 +216,17 @@
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, -1
-; GFX10-NEXT:    scratch_load_ubyte_d16_hi v3, v0, off offset:1
+; GFX10-NEXT:    scratch_load_ubyte_d16_hi v3, v0, off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v3
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_hi_v:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    v_mov_b32_e32 v3, -1
-; GFX11-NEXT:    scratch_load_d16_hi_u8 v3, v0, off offset:1
+; GFX11-NEXT:    v_dual_mov_b32 v3, -1 :: v_dual_add_nc_u32 v0, 1, v0
+; GFX11-NEXT:    scratch_load_d16_hi_u8 v3, v0, off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -235,16 +247,17 @@
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, -1
-; GFX10-NEXT:    scratch_load_sbyte_d16_hi v3, v0, off offset:1
+; GFX10-NEXT:    scratch_load_sbyte_d16_hi v3, v0, off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v3
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_hi_v:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    v_mov_b32_e32 v3, -1
-; GFX11-NEXT:    scratch_load_d16_hi_i8 v3, v0, off offset:1
+; GFX11-NEXT:    v_dual_mov_b32 v3, -1 :: v_dual_add_nc_u32 v0, 1, v0
+; GFX11-NEXT:    scratch_load_d16_hi_i8 v3, v0, off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -265,16 +278,17 @@
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, -1
-; GFX10-NEXT:    scratch_load_short_d16_hi v3, v0, off offset:2
+; GFX10-NEXT:    scratch_load_short_d16_hi v3, v0, off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v3
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i16_to_d16_hi_v:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    v_mov_b32_e32 v3, -1
-; GFX11-NEXT:    scratch_load_d16_hi_b16 v3, v0, off offset:2
+; GFX11-NEXT:    v_dual_mov_b32 v3, -1 :: v_dual_add_nc_u32 v0, 2, v0
+; GFX11-NEXT:    scratch_load_d16_hi_b16 v3, v0, off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -295,15 +309,17 @@
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
 ; GFX10-NEXT:    flat_load_dword v0, v[0:1]
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 4, v2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    scratch_store_byte_d16_hi v2, v0, off offset:4
+; GFX10-NEXT:    scratch_store_byte_d16_hi v1, v0, off
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_store_b8_from_d16_hi_v:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    flat_load_b32 v0, v[0:1]
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 4, v2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    scratch_store_d16_hi_b8 v2, v0, off offset:4
+; GFX11-NEXT:    scratch_store_d16_hi_b8 v1, v0, off
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 bb:
@@ -322,15 +338,17 @@
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
 ; GFX10-NEXT:    flat_load_dword v0, v[0:1]
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 2, v2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    scratch_store_short_d16_hi v2, v0, off offset:2
+; GFX10-NEXT:    scratch_store_short_d16_hi v1, v0, off
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_store_b16_from_d16_hi_v:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    flat_load_b32 v0, v[0:1]
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 2, v2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    scratch_store_d16_hi_b16 v2, v0, off offset:2
+; GFX11-NEXT:    scratch_store_d16_hi_b16 v1, v0, off
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 bb:
@@ -719,8 +737,9 @@
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
-; GFX10-NEXT:    scratch_load_ubyte v0, v0, off offset:1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT:    v_add3_u32 v0, s2, v0, 1
+; GFX10-NEXT:    scratch_load_ubyte v0, v0, off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v0
 ; GFX10-NEXT:    s_endpgm
@@ -749,8 +768,9 @@
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
-; GFX10-NEXT:    scratch_load_sbyte v0, v0, off offset:1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT:    v_add3_u32 v0, s2, v0, 1
+; GFX10-NEXT:    scratch_load_sbyte v0, v0, off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v0
 ; GFX10-NEXT:    s_endpgm
@@ -779,8 +799,9 @@
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
-; GFX10-NEXT:    scratch_load_ushort v0, v0, off offset:2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT:    v_add3_u32 v0, s2, v0, 2
+; GFX10-NEXT:    scratch_load_ushort v0, v0, off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v0
 ; GFX10-NEXT:    s_endpgm
@@ -809,8 +830,9 @@
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
-; GFX10-NEXT:    scratch_load_sshort v0, v0, off offset:2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT:    v_add3_u32 v0, s2, v0, 2
+; GFX10-NEXT:    scratch_load_sshort v0, v0, off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v0
 ; GFX10-NEXT:    s_endpgm
@@ -839,9 +861,10 @@
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffff0000
-; GFX10-NEXT:    scratch_load_ubyte_d16 v3, v0, off offset:1
+; GFX10-NEXT:    v_add3_u32 v0, s2, v0, 1
+; GFX10-NEXT:    scratch_load_ubyte_d16 v3, v0, off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v3
 ; GFX10-NEXT:    s_endpgm
@@ -872,9 +895,10 @@
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffff0000
-; GFX10-NEXT:    scratch_load_sbyte_d16 v3, v0, off offset:1
+; GFX10-NEXT:    v_add3_u32 v0, s2, v0, 1
+; GFX10-NEXT:    scratch_load_sbyte_d16 v3, v0, off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v3
 ; GFX10-NEXT:    s_endpgm
@@ -905,9 +929,10 @@
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffff0000
-; GFX10-NEXT:    scratch_load_short_d16 v3, v0, off offset:2
+; GFX10-NEXT:    v_add3_u32 v0, s2, v0, 2
+; GFX10-NEXT:    scratch_load_short_d16 v3, v0, off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v3
 ; GFX10-NEXT:    s_endpgm
@@ -938,9 +963,10 @@
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, -1
-; GFX10-NEXT:    scratch_load_ubyte_d16_hi v3, v0, off offset:1
+; GFX10-NEXT:    v_add3_u32 v0, s2, v0, 1
+; GFX10-NEXT:    scratch_load_ubyte_d16_hi v3, v0, off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v3
 ; GFX10-NEXT:    s_endpgm
@@ -971,9 +997,10 @@
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, -1
-; GFX10-NEXT:    scratch_load_sbyte_d16_hi v3, v0, off offset:1
+; GFX10-NEXT:    v_add3_u32 v0, s2, v0, 1
+; GFX10-NEXT:    scratch_load_sbyte_d16_hi v3, v0, off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v3
 ; GFX10-NEXT:    s_endpgm
@@ -1004,9 +1031,10 @@
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, -1
-; GFX10-NEXT:    scratch_load_short_d16_hi v3, v0, off offset:2
+; GFX10-NEXT:    v_add3_u32 v0, s2, v0, 2
+; GFX10-NEXT:    scratch_load_short_d16_hi v3, v0, off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v3
 ; GFX10-NEXT:    s_endpgm
@@ -1037,9 +1065,10 @@
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
 ; GFX10-NEXT:    flat_load_dword v0, v[0:1]
-; GFX10-NEXT:    v_lshl_add_u32 v1, v2, 2, s2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v2
+; GFX10-NEXT:    v_add3_u32 v1, s2, v1, 4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    scratch_store_byte_d16_hi v1, v0, off offset:4
+; GFX10-NEXT:    scratch_store_byte_d16_hi v1, v0, off
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_store_b8_from_d16_hi_svs:
@@ -1068,9 +1097,10 @@
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
 ; GFX10-NEXT:    flat_load_dword v0, v[0:1]
-; GFX10-NEXT:    v_lshl_add_u32 v1, v2, 2, s2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v2
+; GFX10-NEXT:    v_add3_u32 v1, s2, v1, 2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    scratch_store_short_d16_hi v1, v0, off offset:2
+; GFX10-NEXT:    scratch_store_short_d16_hi v1, v0, off
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_store_b16_from_d16_hi_svs:
Index: llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
+++ llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
@@ -47,14 +47,17 @@
 ; GFX11-SDAG:       ; %bb.0: ; %bb
 ; GFX11-SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x24
 ; GFX11-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 4
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 dlc
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v3, 4 :: v_dual_add_nc_u32 v4, 1, v0
+; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v5, 2, v0
+; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v0, 4, v0
+; GFX11-SDAG-NEXT:    scratch_store_b8 v4, v1, off dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v5, v2, off dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v3, off dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
@@ -132,13 +135,16 @@
 ; GFX11-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0
 ; GFX11-SDAG-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 dlc
+; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v4, 1, v0
+; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v5, 2, v0
+; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v0, 4, v0
+; GFX11-SDAG-NEXT:    scratch_store_b8 v4, v1, off dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v5, v2, off dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v3, off dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
@@ -299,16 +305,18 @@
 ; GFX11-SDAG:       ; %bb.0: ; %bb
 ; GFX11-SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x24
 ; GFX11-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 4
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 dlc
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v3, 4 :: v_dual_add_nc_u32 v4, 1, v0
+; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v5, 2, v0
+; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v0, 4, v0
+; GFX11-SDAG-NEXT:    scratch_store_b8 v4, v1, off dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v5, v2, off dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v3, off dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
@@ -393,11 +401,15 @@
 ; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
 ; GFX11-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 dlc
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_or_b32_e32 v4, 1, v0
+; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v5, 2, v0
+; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v0, 4, v0
+; GFX11-SDAG-NEXT:    scratch_store_b8 v4, v1, off dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v5, v2, off dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v3, off dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
@@ -563,18 +575,20 @@
 ; GFX11-SDAG-LABEL: soff4_voff1:
 ; GFX11-SDAG:       ; %bb.0: ; %bb
 ; GFX11-SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x24
-; GFX11-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v4, 4
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 2
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v2, 1 :: v_dual_mov_b32 v3, 2
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v5, 4
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT:    v_add3_u32 v2, 4, s0, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add3_u32 v1, 4, s0, v0
 ; GFX11-SDAG-NEXT:    s_add_i32 s0, s0, 4
-; GFX11-SDAG-NEXT:    scratch_store_b8 v2, v1, off offset:1 dlc
+; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v4, 1, v1
+; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v1, 2, v1
+; GFX11-SDAG-NEXT:    scratch_store_b8 v4, v2, off dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v2, v3, off offset:2 dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v1, v3, off dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v4, s0 offset:4 dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v5, s0 offset:4 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
@@ -653,17 +667,18 @@
 ; GFX11-SDAG-LABEL: soff4_voff2:
 ; GFX11-SDAG:       ; %bb.0: ; %bb
 ; GFX11-SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x24
-; GFX11-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v2, 2
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v3, 2 :: v_dual_lshlrev_b32 v0, 1, v0
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v4, 4
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT:    v_add3_u32 v3, 4, s0, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_nc_u32_e64 v1, s0, 4
 ; GFX11-SDAG-NEXT:    s_add_i32 s0, s0, 4
-; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v1, s0 offset:1 dlc
+; GFX11-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 2
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v2, s0 offset:1 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v3, v2, off offset:2 dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v1, v3, off dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v4, s0 offset:4 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
Index: llvm/test/CodeGen/AMDGPU/flat-scratch.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -638,10 +638,11 @@
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
 ; GFX9-NEXT:    v_add_u32_e32 v1, 4, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 15
+; GFX9-NEXT:    v_sub_u32_e32 v0, 4, v0
 ; GFX9-NEXT:    scratch_store_dword v1, v2, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_sub_u32_e32 v0, 4, v0
-; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
+; GFX9-NEXT:    v_add_u32_e32 v0, 0x7c, v0
+; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -653,22 +654,24 @@
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 15
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, 4, v0
-; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 4, v0
-; GFX10-NEXT:    scratch_store_dword v1, v2, off
+; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 4, v0
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, 4, v0
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x7c, v1
+; GFX10-NEXT:    scratch_store_dword v0, v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
+; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: store_load_vindex_kernel:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 4, v0
-; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:4 dlc
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 4, v0
+; GFX11-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_add_nc_u32 v1, 0x7c, v1
+; GFX11-NEXT:    scratch_store_b32 v0, v2, off offset:4 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
+; GFX11-NEXT:    scratch_load_b32 v0, v1, off glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
 ;
@@ -687,7 +690,8 @@
 ; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
 ; GFX9-PAL-NEXT:    scratch_store_dword v1, v2, off
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
+; GFX9-PAL-NEXT:    v_add_u32_e32 v0, 0x7c, v0
+; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_endpgm
 ;
@@ -698,7 +702,8 @@
 ; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:4 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_sub_u32_e32 v0, 4, v0
-; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
+; GFX940-NEXT:    v_add_u32_e32 v0, 0x7c, v0
+; GFX940-NEXT:    scratch_load_dword v0, v0, off sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_endpgm
 ;
@@ -715,22 +720,24 @@
 ; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
 ; GFX10-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 15
-; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v1, 4, v0
-; GFX10-PAL-NEXT:    v_sub_nc_u32_e32 v0, 4, v0
-; GFX10-PAL-NEXT:    scratch_store_dword v1, v2, off
+; GFX10-PAL-NEXT:    v_sub_nc_u32_e32 v1, 4, v0
+; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v0, 4, v0
+; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x7c, v1
+; GFX10-PAL-NEXT:    scratch_store_dword v0, v2, off
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
+; GFX10-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-PAL-NEXT:    s_endpgm
 ;
 ; GFX11-PAL-LABEL: store_load_vindex_kernel:
 ; GFX11-PAL:       ; %bb.0: ; %bb
-; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX11-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-PAL-NEXT:    v_sub_nc_u32_e32 v2, 4, v0
-; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:4 dlc
+; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-PAL-NEXT:    v_sub_nc_u32_e32 v1, 4, v0
+; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_add_nc_u32 v1, 0x7c, v1
+; GFX11-PAL-NEXT:    scratch_store_b32 v0, v2, off offset:4 dlc
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-PAL-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
+; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, off glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_endpgm
 ; GCN-LABEL: store_load_vindex_kernel:
@@ -878,8 +885,9 @@
 ; GFX9-LABEL: private_ptr_foo:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v0, 4, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x41200000
-; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:4
+; GFX9-NEXT:    scratch_store_dword v0, v1, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -887,8 +895,9 @@
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, 4, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0x41200000
-; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:4
+; GFX10-NEXT:    scratch_store_dword v0, v1, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -896,24 +905,26 @@
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0x41200000
-; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:4
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0x41200000 :: v_dual_add_nc_u32 v0, 4, v0
+; GFX11-NEXT:    scratch_store_b32 v0, v1, off
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-PAL-LABEL: private_ptr_foo:
 ; GFX9-PAL:       ; %bb.0:
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-PAL-NEXT:    v_add_u32_e32 v0, 4, v0
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
-; GFX9-PAL-NEXT:    scratch_store_dword v0, v1, off offset:4
+; GFX9-PAL-NEXT:    scratch_store_dword v0, v1, off
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: private_ptr_foo:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_add_u32_e32 v0, 4, v0
 ; GFX940-NEXT:    v_mov_b32_e32 v1, 0x41200000
-; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:4
+; GFX940-NEXT:    scratch_store_dword v0, v1, off
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -921,8 +932,9 @@
 ; GFX10-PAL:       ; %bb.0:
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v0, 4, v0
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
-; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off offset:4
+; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -930,8 +942,8 @@
 ; GFX11-PAL:       ; %bb.0:
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
-; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:4
+; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 0x41200000 :: v_dual_add_nc_u32 v0, 4, v0
+; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
 ; GCN-LABEL: private_ptr_foo:
@@ -1683,10 +1695,11 @@
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_add_u32_e32 v1, 0x104, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 15
+; GFX9-NEXT:    v_sub_u32_e32 v0, 0x104, v0
 ; GFX9-NEXT:    scratch_store_dword v1, v2, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_sub_u32_e32 v0, 0x104, v0
-; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
+; GFX9-NEXT:    v_add_u32_e32 v0, 0x7c, v0
+; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -1700,23 +1713,26 @@
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX10-NEXT:    scratch_load_dword v3, off, off offset:4 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x104, v0
-; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0x104, v0
-; GFX10-NEXT:    scratch_store_dword v1, v2, off
+; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 0x104, v0
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x104, v0
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x7c, v1
+; GFX10-NEXT:    scratch_store_dword v0, v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
+; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: store_load_vindex_small_offset_kernel:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-NEXT:    scratch_load_b32 v3, off, off offset:4 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 0x104, v0
-; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:260 dlc
+; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 0x104, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_add_nc_u32 v1, 0x7c, v1
+; GFX11-NEXT:    scratch_store_b32 v0, v2, off offset:260 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
+; GFX11-NEXT:    scratch_load_b32 v0, v1, off glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
 ;
@@ -1735,10 +1751,11 @@
 ; GFX9-PAL-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    v_add_u32_e32 v1, 0x104, v0
+; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, 0x104, v0
 ; GFX9-PAL-NEXT:    scratch_store_dword v1, v2, off
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, 0x104, v0
-; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
+; GFX9-PAL-NEXT:    v_add_u32_e32 v0, 0x7c, v0
+; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_endpgm
 ;
@@ -1751,7 +1768,8 @@
 ; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:260 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_sub_u32_e32 v0, 0x104, v0
-; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
+; GFX940-NEXT:    v_add_u32_e32 v0, 0x7c, v0
+; GFX940-NEXT:    scratch_load_dword v0, v0, off sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_endpgm
 ;
@@ -1771,11 +1789,12 @@
 ; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX1010-PAL-NEXT:    scratch_load_dword v3, off, vcc_lo offset:4 glc dlc
 ; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x104, v0
-; GFX1010-PAL-NEXT:    v_sub_nc_u32_e32 v0, 0x104, v0
-; GFX1010-PAL-NEXT:    scratch_store_dword v1, v2, off
+; GFX1010-PAL-NEXT:    v_sub_nc_u32_e32 v1, 0x104, v0
+; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v0, 0x104, v0
+; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x7c, v1
+; GFX1010-PAL-NEXT:    scratch_store_dword v0, v2, off
 ; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1010-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
+; GFX1010-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
 ; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1010-PAL-NEXT:    s_endpgm
 ;
@@ -1794,23 +1813,26 @@
 ; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX1030-PAL-NEXT:    scratch_load_dword v3, off, off offset:4 glc dlc
 ; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x104, v0
-; GFX1030-PAL-NEXT:    v_sub_nc_u32_e32 v0, 0x104, v0
-; GFX1030-PAL-NEXT:    scratch_store_dword v1, v2, off
+; GFX1030-PAL-NEXT:    v_sub_nc_u32_e32 v1, 0x104, v0
+; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v0, 0x104, v0
+; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x7c, v1
+; GFX1030-PAL-NEXT:    scratch_store_dword v0, v2, off
 ; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1030-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
+; GFX1030-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
 ; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1030-PAL-NEXT:    s_endpgm
 ;
 ; GFX11-PAL-LABEL: store_load_vindex_small_offset_kernel:
 ; GFX11-PAL:       ; %bb.0: ; %bb
-; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-PAL-NEXT:    scratch_load_b32 v3, off, off offset:4 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-PAL-NEXT:    v_sub_nc_u32_e32 v2, 0x104, v0
-; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:260 dlc
+; GFX11-PAL-NEXT:    v_sub_nc_u32_e32 v1, 0x104, v0
+; GFX11-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_add_nc_u32 v1, 0x7c, v1
+; GFX11-PAL-NEXT:    scratch_store_b32 v0, v2, off offset:260 dlc
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-PAL-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
+; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, off glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_endpgm
 bb:
@@ -2739,10 +2761,11 @@
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_add_u32_e32 v1, 0x4004, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 15
+; GFX9-NEXT:    v_sub_u32_e32 v0, 0x4004, v0
 ; GFX9-NEXT:    scratch_store_dword v1, v2, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_sub_u32_e32 v0, 0x4004, v0
-; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
+; GFX9-NEXT:    v_add_u32_e32 v0, 0x7c, v0
+; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -2756,24 +2779,27 @@
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX10-NEXT:    scratch_load_dword v3, off, off offset:4 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x4004, v0
-; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0x4004, v0
-; GFX10-NEXT:    scratch_store_dword v1, v2, off
+; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 0x4004, v0
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x4004, v0
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x7c, v1
+; GFX10-NEXT:    scratch_store_dword v0, v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
+; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: store_load_vindex_large_offset_kernel:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-NEXT:    s_movk_i32 vcc_lo, 0x4004
 ; GFX11-NEXT:    scratch_load_b32 v3, off, off offset:4 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 0x4004, v0
-; GFX11-NEXT:    scratch_store_b32 v0, v1, vcc_lo dlc
+; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 0x4004, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_add_nc_u32 v1, 0x7c, v1
+; GFX11-NEXT:    scratch_store_b32 v0, v2, vcc_lo dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
+; GFX11-NEXT:    scratch_load_b32 v0, v1, off glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
 ;
@@ -2792,10 +2818,11 @@
 ; GFX9-PAL-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    v_add_u32_e32 v1, 0x4004, v0
+; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, 0x4004, v0
 ; GFX9-PAL-NEXT:    scratch_store_dword v1, v2, off
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, 0x4004, v0
-; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
+; GFX9-PAL-NEXT:    v_add_u32_e32 v0, 0x7c, v0
+; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_endpgm
 ;
@@ -2809,7 +2836,8 @@
 ; GFX940-NEXT:    scratch_store_dword v0, v1, vcc_hi sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_sub_u32_e32 v0, 0x4004, v0
-; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
+; GFX940-NEXT:    v_add_u32_e32 v0, 0x7c, v0
+; GFX940-NEXT:    scratch_load_dword v0, v0, off sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_endpgm
 ;
@@ -2829,11 +2857,12 @@
 ; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX1010-PAL-NEXT:    scratch_load_dword v3, off, vcc_lo offset:4 glc dlc
 ; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x4004, v0
-; GFX1010-PAL-NEXT:    v_sub_nc_u32_e32 v0, 0x4004, v0
-; GFX1010-PAL-NEXT:    scratch_store_dword v1, v2, off
+; GFX1010-PAL-NEXT:    v_sub_nc_u32_e32 v1, 0x4004, v0
+; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v0, 0x4004, v0
+; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x7c, v1
+; GFX1010-PAL-NEXT:    scratch_store_dword v0, v2, off
 ; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1010-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
+; GFX1010-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
 ; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1010-PAL-NEXT:    s_endpgm
 ;
@@ -2852,24 +2881,27 @@
 ; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX1030-PAL-NEXT:    scratch_load_dword v3, off, off offset:4 glc dlc
 ; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x4004, v0
-; GFX1030-PAL-NEXT:    v_sub_nc_u32_e32 v0, 0x4004, v0
-; GFX1030-PAL-NEXT:    scratch_store_dword v1, v2, off
+; GFX1030-PAL-NEXT:    v_sub_nc_u32_e32 v1, 0x4004, v0
+; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v0, 0x4004, v0
+; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x7c, v1
+; GFX1030-PAL-NEXT:    scratch_store_dword v0, v2, off
 ; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1030-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
+; GFX1030-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
 ; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1030-PAL-NEXT:    s_endpgm
 ;
 ; GFX11-PAL-LABEL: store_load_vindex_large_offset_kernel:
 ; GFX11-PAL:       ; %bb.0: ; %bb
-; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4004
 ; GFX11-PAL-NEXT:    scratch_load_b32 v3, off, off offset:4 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-PAL-NEXT:    v_sub_nc_u32_e32 v2, 0x4004, v0
-; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, vcc_lo dlc
+; GFX11-PAL-NEXT:    v_sub_nc_u32_e32 v1, 0x4004, v0
+; GFX11-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_add_nc_u32 v1, 0x7c, v1
+; GFX11-PAL-NEXT:    scratch_store_b32 v0, v2, vcc_lo dlc
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-PAL-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
+; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, off glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_endpgm
 bb:
@@ -3327,16 +3359,17 @@
 ; GFX9-LABEL: store_load_vidx_sidx_offset:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 4
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
 ; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
+; GFX9-NEXT:    v_add_u32_e32 v0, 0x400, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 15
-; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:1024
+; GFX9-NEXT:    scratch_store_dword v0, v1, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc
+; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -3347,13 +3380,14 @@
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
 ; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
-; GFX10-NEXT:    v_mov_b32_e32 v1, 15
+; GFX10-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, s0, v0
-; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, 4
-; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:1024
+; GFX10-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
+; GFX10-NEXT:    v_add3_u32 v0, v1, v0, 0x400
+; GFX10-NEXT:    v_mov_b32_e32 v1, 15
+; GFX10-NEXT:    scratch_store_dword v0, v1, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc dlc
+; GFX10-NEXT:    scratch_load_dword v0, v0, off glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_endpgm
 ;
@@ -3380,12 +3414,13 @@
 ; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
 ; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
 ; GFX9-PAL-NEXT:    v_add_u32_e32 v0, s0, v0
-; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
 ; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
+; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
+; GFX9-PAL-NEXT:    v_add_u32_e32 v0, 0x400, v0
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
-; GFX9-PAL-NEXT:    scratch_store_dword v0, v1, off offset:1024
+; GFX9-PAL-NEXT:    scratch_store_dword v0, v1, off
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc
+; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_endpgm
 ;
@@ -3413,13 +3448,14 @@
 ; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
 ; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
 ; GFX10-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
+; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
-; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, 4
-; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off offset:1024
+; GFX10-PAL-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
+; GFX10-PAL-NEXT:    v_add3_u32 v0, v1, v0, 0x400
+; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
+; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc dlc
+; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off glc dlc
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-PAL-NEXT:    s_endpgm
 ;
@@ -3874,10 +3910,11 @@
 ; GFX10:       ; %bb.0: ; %bb
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, -1, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 1
-; GFX10-NEXT:    scratch_store_byte v0, v1, off offset:-1
+; GFX10-NEXT:    scratch_store_byte v0, v1, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    scratch_load_ubyte v0, v0, off offset:-1 glc dlc
+; GFX10-NEXT:    scratch_load_ubyte v0, v0, off glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3885,10 +3922,10 @@
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v1, 1
-; GFX11-NEXT:    scratch_store_b8 v0, v1, off offset:-1 dlc
+; GFX11-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, -1, v0
+; GFX11-NEXT:    scratch_store_b8 v0, v1, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_load_u8 v0, v0, off offset:-1 glc dlc
+; GFX11-NEXT:    scratch_load_u8 v0, v0, off glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3914,37 +3951,26 @@
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1010-PAL-LABEL: store_load_i32_negative_unaligned:
-; GFX1010-PAL:       ; %bb.0: ; %bb
-; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v0, -1, v0
-; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 1
-; GFX1010-PAL-NEXT:    scratch_store_byte v0, v1, off
-; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1010-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc dlc
-; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX1010-PAL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-PAL-LABEL: store_load_i32_negative_unaligned:
-; GFX1030-PAL:       ; %bb.0: ; %bb
-; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 1
-; GFX1030-PAL-NEXT:    scratch_store_byte v0, v1, off offset:-1
-; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1030-PAL-NEXT:    scratch_load_ubyte v0, v0, off offset:-1 glc dlc
-; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX1030-PAL-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-PAL-LABEL: store_load_i32_negative_unaligned:
+; GFX10-PAL:       ; %bb.0: ; %bb
+; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v0, -1, v0
+; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 1
+; GFX10-PAL-NEXT:    scratch_store_byte v0, v1, off
+; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc dlc
+; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-PAL-LABEL: store_load_i32_negative_unaligned:
 ; GFX11-PAL:       ; %bb.0: ; %bb
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 1
-; GFX11-PAL-NEXT:    scratch_store_b8 v0, v1, off offset:-1 dlc
+; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, -1, v0
+; GFX11-PAL-NEXT:    scratch_store_b8 v0, v1, off dlc
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-PAL-NEXT:    scratch_load_u8 v0, v0, off offset:-1 glc dlc
+; GFX11-PAL-NEXT:    scratch_load_u8 v0, v0, off glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
 bb:
Index: llvm/test/CodeGen/AMDGPU/memory_clause.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/memory_clause.ll
+++ llvm/test/CodeGen/AMDGPU/memory_clause.ll
@@ -209,20 +209,26 @@
 ; GCN-SCRATCH-NEXT:    v_lshlrev_b32_e32 v2, 4, v31
 ; GCN-SCRATCH-NEXT:    v_and_b32_e32 v18, 0x3ff0, v2
 ; GCN-SCRATCH-NEXT:    v_add_nc_u32_e32 v0, v0, v18
+; GCN-SCRATCH-NEXT:    v_add_nc_u32_e32 v6, 16, v0
+; GCN-SCRATCH-NEXT:    v_add_nc_u32_e32 v10, 32, v0
+; GCN-SCRATCH-NEXT:    v_add_nc_u32_e32 v14, 48, v0
 ; GCN-SCRATCH-NEXT:    s_clause 0x3
 ; GCN-SCRATCH-NEXT:    scratch_load_dwordx4 v[2:5], v0, off
-; GCN-SCRATCH-NEXT:    scratch_load_dwordx4 v[6:9], v0, off offset:16
-; GCN-SCRATCH-NEXT:    scratch_load_dwordx4 v[10:13], v0, off offset:32
-; GCN-SCRATCH-NEXT:    scratch_load_dwordx4 v[14:17], v0, off offset:48
+; GCN-SCRATCH-NEXT:    scratch_load_dwordx4 v[6:9], v6, off
+; GCN-SCRATCH-NEXT:    scratch_load_dwordx4 v[10:13], v10, off
+; GCN-SCRATCH-NEXT:    scratch_load_dwordx4 v[14:17], v14, off
 ; GCN-SCRATCH-NEXT:    v_add_nc_u32_e32 v0, v1, v18
+; GCN-SCRATCH-NEXT:    v_add_nc_u32_e32 v1, 16, v0
+; GCN-SCRATCH-NEXT:    v_add_nc_u32_e32 v18, 32, v0
+; GCN-SCRATCH-NEXT:    v_add_nc_u32_e32 v19, 48, v0
 ; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(3)
 ; GCN-SCRATCH-NEXT:    scratch_store_dwordx4 v0, v[2:5], off
 ; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(2)
-; GCN-SCRATCH-NEXT:    scratch_store_dwordx4 v0, v[6:9], off offset:16
+; GCN-SCRATCH-NEXT:    scratch_store_dwordx4 v1, v[6:9], off
 ; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(1)
-; GCN-SCRATCH-NEXT:    scratch_store_dwordx4 v0, v[10:13], off offset:32
+; GCN-SCRATCH-NEXT:    scratch_store_dwordx4 v18, v[10:13], off
 ; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GCN-SCRATCH-NEXT:    scratch_store_dwordx4 v0, v[14:17], off offset:48
+; GCN-SCRATCH-NEXT:    scratch_store_dwordx4 v19, v[14:17], off
 ; GCN-SCRATCH-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GCN-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
 bb: