diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -216,16 +216,6 @@
     std::vector<GlobalVariable *> KernelUsedVariables =
         AMDGPU::findLDSVariablesToLower(M, &F);

-    // Replace all constant uses with instructions if they belong to the
-    // current kernel. Unnecessary, removing will cause test churn.
-    for (GlobalVariable *GV : KernelUsedVariables) {
-      for (User *U : make_early_inc_range(GV->users())) {
-        if (ConstantExpr *C = dyn_cast<ConstantExpr>(U))
-          AMDGPU::replaceConstantUsesInFunction(C, &F);
-      }
-      GV->removeDeadConstantUsers();
-    }
-
     if (!KernelUsedVariables.empty()) {
       std::string VarName =
           (Twine("llvm.amdgcn.kernel.") + F.getName() + ".lds").str();
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
--- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -1002,7 +1002,7 @@
 ; CI: ; %bb.0:
 ; CI-NEXT: v_mov_b32_e32 v0, 0
 ; CI-NEXT: s_mov_b32 m0, -1
-; CI-NEXT: ds_read_b128 v[0:3], v0
+; CI-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; CI-NEXT: s_mov_b32 s3, 0xf000
 ; CI-NEXT: s_mov_b32 s2, -1
@@ -1012,16 +1012,27 @@
 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; CI-NEXT: s_endpgm
 ;
-; GFX9-LABEL: load_misaligned64_constant_offsets:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: ds_read_b128 v[0:3], v4
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
-; GFX9-NEXT: s_endpgm
+; GFX9-ALIGNED-LABEL: load_misaligned64_constant_offsets:
+; GFX9-ALIGNED: ; %bb.0:
+; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-ALIGNED-NEXT: ds_read2_b64 v[0:3], v4 offset1:1
+; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-ALIGNED-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-ALIGNED-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-ALIGNED-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX9-ALIGNED-NEXT: s_endpgm
+;
+; GFX9-UNALIGNED-LABEL: load_misaligned64_constant_offsets:
+; GFX9-UNALIGNED: ; %bb.0:
+; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-UNALIGNED-NEXT: ds_read_b128 v[0:3], v4
+; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-UNALIGNED-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-UNALIGNED-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-UNALIGNED-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX9-UNALIGNED-NEXT: s_endpgm
   %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
   %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
   %sum = add i64 %val0, %val1
diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
--- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
@@ -816,22 +816,31 @@
 define amdgpu_kernel void @store_misaligned64_constant_offsets() {
 ; CI-LABEL: store_misaligned64_constant_offsets:
 ; CI: ; %bb.0:
-; CI-NEXT: v_mov_b32_e32 v0, 0x7b
-; CI-NEXT: v_mov_b32_e32 v1, 0
-; CI-NEXT: v_mov_b32_e32 v2, v0
-; CI-NEXT: v_mov_b32_e32 v3, v1
+; CI-NEXT: s_mov_b64 s[0:1], 0x7b
+; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: v_mov_b32_e32 v2, 0
+; CI-NEXT: v_mov_b32_e32 v1, s1
 ; CI-NEXT: s_mov_b32 m0, -1
-; CI-NEXT: ds_write_b128 v1, v[0:3]
+; CI-NEXT: ds_write2_b64 v2, v[0:1], v[0:1] offset1:1
 ; CI-NEXT: s_endpgm
 ;
-; GFX9-LABEL: store_misaligned64_constant_offsets:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: ds_write_b128 v1, v[0:3]
-; GFX9-NEXT: s_endpgm
+; GFX9-ALIGNED-LABEL: store_misaligned64_constant_offsets:
+; GFX9-ALIGNED: ; %bb.0:
+; GFX9-ALIGNED-NEXT: s_mov_b64 s[0:1], 0x7b
+; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-ALIGNED-NEXT: ds_write2_b64 v2, v[0:1], v[0:1] offset1:1
+; GFX9-ALIGNED-NEXT: s_endpgm
+;
+; GFX9-UNALIGNED-LABEL: store_misaligned64_constant_offsets:
+; GFX9-UNALIGNED: ; %bb.0:
+; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-UNALIGNED-NEXT: ds_write_b128 v1, v[0:3]
+; GFX9-UNALIGNED-NEXT: s_endpgm
   store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
   store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/loop_break.ll b/llvm/test/CodeGen/AMDGPU/loop_break.ll
--- a/llvm/test/CodeGen/AMDGPU/loop_break.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop_break.ll
@@ -209,7 +209,11 @@
 ; GCN-NEXT: ; implicit-def: $sgpr6
 ; GCN-NEXT: .LBB2_1: ; %bb1
 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT: s_or_b64 s[4:5], s[4:5], exec
+; GCN-NEXT: s_cmp_lg_u32 lds@abs32@lo, 4
+; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GCN-NEXT: s_andn2_b64 s[4:5], s[4:5], exec
+; GCN-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GCN-NEXT: s_cmp_gt_i32 s6, -1
 ; GCN-NEXT: s_cbranch_scc1 .LBB2_3
 ; GCN-NEXT: ; %bb.2: ; %bb4
diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll
@@ -24,9 +24,7 @@
 ;.
 define amdgpu_kernel void @k0(i64 %x) {
 ; CHECK-LABEL: @k0(
-; CHECK-NEXT: %1 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0), i32 0, i32 0
-; CHECK-NEXT: %2 = addrspacecast i8 addrspace(3)* %1 to i8*
-; CHECK-NEXT: %ptr = getelementptr inbounds i8, i8* %2, i64 %x
+; CHECK-NEXT: %ptr = getelementptr inbounds i8, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(3)* @lds.1, i32 0, i32 0) to i8*), i64 %x
 ; CHECK-NEXT: store i8 1, i8* %ptr, align 1
 ; CHECK-NEXT: ret void
 ;
@@ -37,9 +35,7 @@

 define amdgpu_kernel void @k1(i64 %x) {
 ; CHECK-LABEL: @k1(
-; CHECK-NEXT: %1 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 0), i32 0, i32 0
-; CHECK-NEXT: %2 = addrspacecast i8 addrspace(3)* %1 to i8*
-; CHECK-NEXT: %ptr = getelementptr inbounds i8, i8* %2, i64 %x
+; CHECK-NEXT: %ptr = getelementptr inbounds i8, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(3)* @lds.1, i32 0, i32 0) to i8*), i64 %x
 ; CHECK-NEXT: store i8 1, i8* %ptr, align 1
 ; CHECK-NEXT: ret void
 ;
@@ -71,14 +67,10 @@
 ; Use constant twice from the same kernel but a different other constant.
 define amdgpu_kernel void @k3(i64 %x) {
 ; CHECK-LABEL: @k3(
-; CHECK-NEXT: %1 = getelementptr inbounds [32 x i8], [32 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k3.lds.t, %llvm.amdgcn.kernel.k3.lds.t addrspace(3)* @llvm.amdgcn.kernel.k3.lds, i32 0, i32 0), i32 0, i32 16
-; CHECK-NEXT: %2 = bitcast i8 addrspace(3)* %1 to i64 addrspace(3)*
-; CHECK-NEXT: %ptr1 = addrspacecast i64 addrspace(3)* %2 to i64*
+; CHECK-NEXT: %ptr1 = addrspacecast i64 addrspace(3)* bitcast (i8 addrspace(3)* getelementptr inbounds ([32 x i8], [32 x i8] addrspace(3)* @lds.3, i32 0, i32 16) to i64 addrspace(3)*) to i64*
 ; CHECK-NEXT: store i64 1, i64* %ptr1, align 1
-; CHECK-NEXT: %3 = getelementptr inbounds [32 x i8], [32 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k3.lds.t, %llvm.amdgcn.kernel.k3.lds.t addrspace(3)* @llvm.amdgcn.kernel.k3.lds, i32 0, i32 0), i32 0, i32 24
-; CHECK-NEXT: %4 = bitcast i8 addrspace(3)* %3 to i64 addrspace(3)*
-; CHECK-NEXT: %ptr2 = addrspacecast i64 addrspace(3)* %4 to i64*
-; CHECK-NEXT: store i64 2, i64* %ptr2, align 8
+; CHECK-NEXT: %ptr2 = addrspacecast i64 addrspace(3)* bitcast (i8 addrspace(3)* getelementptr inbounds ([32 x i8], [32 x i8] addrspace(3)* @lds.3, i32 0, i32 24) to i64 addrspace(3)*) to i64*
+; CHECK-NEXT: store i64 2, i64* %ptr2, align 1
 ; CHECK-NEXT: ret void
 ;
   %ptr1 = addrspacecast i64 addrspace(3)* bitcast (i8 addrspace(3)* getelementptr inbounds ([32 x i8], [32 x i8] addrspace(3)* @lds.3, i32 0, i32 16) to i64 addrspace(3)*) to i64*
@@ -91,9 +83,7 @@
 ; @lds.1 is used from constant expressions in different kernels.
 define amdgpu_kernel void @k4(i64 %x) {
 ; CHECK-LABEL: @k4(
-; CHECK-NEXT: %1 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k4.lds.t, %llvm.amdgcn.kernel.k4.lds.t addrspace(3)* @llvm.amdgcn.kernel.k4.lds, i32 0, i32 0), i32 0, i32 0
-; CHECK-NEXT: %2 = addrspacecast i8 addrspace(3)* %1 to i8*
-; CHECK-NEXT: %ptr = getelementptr inbounds i8, i8* %2, i64 %x
+; CHECK-NEXT: %ptr = getelementptr inbounds i8, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(3)* @lds.1, i32 0, i32 0) to i8*), i64 %x
 ; CHECK-NEXT: store i8 1, i8* %ptr, align 1
 ; CHECK-NEXT: ret void
 ;
@@ -107,9 +97,8 @@
 ; Multiple constexpr use in a same instruction.
 define amdgpu_kernel void @k5() {
 ; CHECK-LABEL: @k5(
-; CHECK-NEXT: %1 = addrspacecast [505 x i32] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k5.lds.t, %llvm.amdgcn.kernel.k5.lds.t addrspace(3)* @llvm.amdgcn.kernel.k5.lds, i32 0, i32 0) to [505 x i32]*
-; CHECK-NEXT: %2 = getelementptr inbounds [505 x i32], [505 x i32]* %1, i64 0, i64 0
-; CHECK-NEXT: call void undef(i32* %2, i32* %2)
+; CHECK-NEXT: call void undef(i32* getelementptr inbounds ([505 x i32], [505 x i32]* addrspacecast ([505 x i32] addrspace(3)* @lds.4 to [505 x i32]*), i64 0, i64 0), i32* getelementptr inbounds ([505 x i32], [505 x i32]* addrspacecast ([505 x i32] addrspace(3)* @lds.4 to [505 x i32]*), i64 0, i64 0))
+; CHECK-NEXT: ret void
 ;
   call void undef(i32* getelementptr inbounds ([505 x i32], [505 x i32]* addrspacecast ([505 x i32] addrspace(3)* @lds.4 to [505 x i32]*), i64 0, i64 0), i32* getelementptr inbounds ([505 x i32], [505 x i32]* addrspacecast ([505 x i32] addrspace(3)* @lds.4 to [505 x i32]*), i64 0, i64 0))
   ret void
@@ -122,10 +111,8 @@
 ; expression operands of store should be replaced by corresponding instruction sequence.
 define amdgpu_kernel void @k6() {
 ; CHECK-LABEL: @k6(
-; CHECK-NEXT: %1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k6.lds.t, %llvm.amdgcn.kernel.k6.lds.t addrspace(3)* @llvm.amdgcn.kernel.k6.lds, i32 0, i32 0), i32 0, i32 2
-; CHECK-NEXT: %2 = ptrtoint i32 addrspace(3)* %1 to i32
-; CHECK-NEXT: store i32 %2, i32 addrspace(3)* %1, align 8
-; CHECK-NEXT: ret void
+; CHECK-NEXT: store i32 ptrtoint (i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @lds.5, i32 0, i32 2) to i32), i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @lds.5, i32 0, i32 2), align 4
+; CHECK-NEXT: ret void
 ;
   store i32 ptrtoint (i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @lds.5, i32 0, i32 2) to i32), i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @lds.5, i32 0, i32 2)
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-super-align.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-super-align.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-super-align.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-super-align.ll
@@ -28,9 +28,7 @@
 ; SUPER-ALIGN_OFF: @llvm.amdgcn.kernel.k4.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k4.lds.t undef, align 4

 ; CHECK-LABEL: @k1
-; CHECK: %1 = getelementptr inbounds [32 x i8], [32 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 0), i32 0, i32 0
-; CHECK: %2 = addrspacecast i8 addrspace(3)* %1 to i8*
-; CHECK: %ptr = getelementptr inbounds i8, i8* %2, i64 %x
+; CHECK: %ptr = getelementptr inbounds i8, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([32 x i8], [32 x i8] addrspace(3)* @lds.1, i32 0, i32 0) to i8*), i64 %x
 ; CHECK: store i8 1, i8* %ptr, align 1
 define amdgpu_kernel void @k1(i64 %x) {
   %ptr = getelementptr inbounds i8, i8* addrspacecast ([32 x i8] addrspace(3)* @lds.1 to i8*), i64 %x
@@ -58,7 +56,14 @@
 ; Check that alignment is propagated to uses for arrays.

 ; CHECK-LABEL: @k3
-; CHECK: store i32 1, i32 addrspace(3)* %ptr1, align 8
+; CHECK: %ptr0 = getelementptr inbounds i64, i64 addrspace(3)* getelementptr inbounds ([32 x i64], [32 x i64] addrspace(3)* @lds.4, i32 0, i32 0), i64 0
+; CHECK: store i64 0, i64 addrspace(3)* %ptr0, align 8
+; CHECK: %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(3)* @lds.5, i32 0, i32 0), i64 2
+; CHECK: %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(3)* @lds.5, i32 0, i32 0), i64 3
+; CHECK: %ptr3 = getelementptr inbounds i32, i32 addrspace(3)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(3)* @lds.5, i32 0, i32 0), i64 4
+; CHECK: %ptr4 = getelementptr inbounds i32, i32 addrspace(3)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(3)* @lds.5, i32 0, i32 0), i64 5
+; CHECK: %ptr5 = getelementptr inbounds i32, i32 addrspace(3)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(3)* @lds.5, i32 0, i32 0), i64 %x
+; CHECK: store i32 1, i32 addrspace(3)* %ptr1, align 4
 ; CHECK: store i32 2, i32 addrspace(3)* %ptr2, align 4
 ; SUPER-ALIGN_ON: store i32 3, i32 addrspace(3)* %ptr3, align 16
 ; SUPER-ALIGN_OFF: store i32 3, i32 addrspace(3)* %ptr3, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll
@@ -5,11 +5,10 @@
 @b = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4

 ; CHECK-LABEL: @no_clobber_ds_load_stores_x2_preexisting_aa
-; CHECK: store i32 1, i32 addrspace(3)* %0, align 16, !tbaa !0, !noalias !5
-; CHECK: %val.a = load i32, i32 addrspace(3)* %gep.a, align 4, !tbaa !0, !noalias !5
-; CHECK: store i32 2, i32 addrspace(3)* %1, align 16, !tbaa !0, !noalias !5
-; CHECK: %val.b = load i32, i32 addrspace(3)* %gep.b, align 4, !tbaa !0, !noalias !5
-
+; CHECK: store i32 1, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @a, i32 0, i32 0), align 4, !tbaa !0, !alias.scope !5, !noalias !8
+; CHECK: %val.a = load i32, i32 addrspace(3)* %gep.a, align 4, !tbaa !0, !noalias !10
+; CHECK: store i32 2, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @b, i32 0, i32 0), align 4, !tbaa !0, !alias.scope !8, !noalias !5
+; CHECK: %val.b = load i32, i32 addrspace(3)* %gep.b, align 4, !tbaa !0, !noalias !10
 define amdgpu_kernel void @no_clobber_ds_load_stores_x2_preexisting_aa(i32 addrspace(1)* %arg, i32 %i) {
 bb:
   store i32 1, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @a, i32 0, i32 0), align 4, !alias.scope !0, !noalias !3, !tbaa !5
@@ -39,4 +38,9 @@
 ; CHECK:!2 = !{!"int", !3, i64 0}
 ; CHECK:!3 = !{!"omnipotent char", !4, i64 0}
 ; CHECK:!4 = !{!"Simple C++ TBAA"}
-; CHECK:!5 = !{}
+; CHECK:!5 = !{!6}
+; CHECK:!6 = distinct !{!6, !7}
+; CHECK:!7 = distinct !{!7}
+; CHECK:!8 = !{!9}
+; CHECK:!9 = distinct !{!9, !7}
+; CHECK:!10 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll
@@ -11,17 +11,14 @@
 ; are not adjacent. They are only moved later by MachineScheduler.

 ; GCN-LABEL: {{^}}no_clobber_ds_load_stores_x2:
-; GCN: ds_write_b32
-; GCN: ds_write_b32
-; GCN: ds_read_b32
-; GCN: ds_read_b32
+; GCN: ds_read2st64_b32
+; GCN: ds_write2st64

 ; CHECK-LABEL: @no_clobber_ds_load_stores_x2
-; CHECK: store i32 1, i32 addrspace(3)* %0, align 16, !alias.scope !0, !noalias !3
+; CHECK: store i32 1, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @a, i32 0, i32 0), align 4
 ; CHECK: %val.a = load i32, i32 addrspace(3)* %gep.a, align 4, !alias.scope !0, !noalias !3
-; CHECK: store i32 2, i32 addrspace(3)* %1, align 16, !alias.scope !3, !noalias !0
+; CHECK: store i32 2, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @b, i32 0, i32 0), align 4
 ; CHECK: %val.b = load i32, i32 addrspace(3)* %gep.b, align 4, !alias.scope !3, !noalias !0
-
 define amdgpu_kernel void @no_clobber_ds_load_stores_x2(i32 addrspace(1)* %arg, i32 %i) {
 bb:
   store i32 1, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @a, i32 0, i32 0), align 4
@@ -37,20 +34,17 @@

 ; GCN-LABEL: {{^}}no_clobber_ds_load_stores_x3:
 ; GCN-DAG: ds_write_b32
-; GCN-DAG: ds_write_b32
-; GCN-DAG: ds_write_b32
-; GCN-DAG: ds_read_b32
-; GCN-DAG: ds_read_b32
+; GCN-DAG: ds_read2st64_b32
 ; GCN-DAG: ds_read_b32
+; GCN-DAG: ds_write2st64_b32

 ; CHECK-LABEL: @no_clobber_ds_load_stores_x3
-; CHECK: store i32 1, i32 addrspace(3)* %0, align 16, !alias.scope !5, !noalias !8
-; CHECK: %val.a = load i32, i32 addrspace(3)* %gep.a, align 4, !alias.scope !5, !noalias !8
-; CHECK: store i32 2, i32 addrspace(3)* %1, align 16, !alias.scope !11, !noalias !12
-; CHECK: %val.b = load i32, i32 addrspace(3)* %gep.b, align 4, !alias.scope !11, !noalias !12
-; CHECK: store i32 3, i32 addrspace(3)* %2, align 16, !alias.scope !13, !noalias !14
-; CHECK: %val.c = load i32, i32 addrspace(3)* %gep.c, align 4, !alias.scope !13, !noalias !14
-
+; CHECK: store i32 1, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @a, i32 0, i32 0), align 4
+; CHECK: %val.a = load i32, i32 addrspace(3)* %gep.a, align 4, !alias.scope !5, !noalias !8
+; CHECK: store i32 2, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @b, i32 0, i32 0), align 4
+; CHECK: %val.b = load i32, i32 addrspace(3)* %gep.b, align 4, !alias.scope !11, !noalias !12
+; CHECK: store i32 3, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @c, i32 0, i32 0), align 4
+; CHECK: %val.c = load i32, i32 addrspace(3)* %gep.c, align 4, !alias.scope !13, !noalias !14
 define amdgpu_kernel void @no_clobber_ds_load_stores_x3(i32 addrspace(1)* %arg, i32 %i) {
 bb:
   store i32 1, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @a, i32 0, i32 0), align 4
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
@@ -35,21 +35,11 @@
   ret void
 }

-; CHECK-LABEL: @timestwo() #0
-; CHECK-NOT: call void @llvm.donothing()
-; CHECK: %1 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.timestwo.lds.t, %llvm.amdgcn.kernel.timestwo.lds.t addrspace(3)* @llvm.amdgcn.kernel.timestwo.lds, i32 0, i32 0) to i32 addrspace(3)*
-; CHECK: %2 = addrspacecast i32 addrspace(3)* %1 to i32*
-; CHECK: %3 = ptrtoint i32* %2 to i64
-; CHECK: %4 = add i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64), %3
-; CHECK: %5 = inttoptr i64 %4 to i32*
-; CHECK: %ld = load i32, i32* %5, align 4
-; CHECK: %mul = mul i32 %ld, 2
-; CHECK: %6 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.timestwo.lds.t, %llvm.amdgcn.kernel.timestwo.lds.t addrspace(3)* @llvm.amdgcn.kernel.timestwo.lds, i32 0, i32 0) to i32 addrspace(3)*
-; CHECK: %7 = addrspacecast i32 addrspace(3)* %6 to i32*
-; CHECK: %8 = ptrtoint i32* %7 to i64
-; CHECK: %9 = add i64 %8, ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64)
-; CHECK: %10 = inttoptr i64 %9 to i32*
-; CHECK: store i32 %mul, i32* %10, align 4
+; CHECK-LABEL: define amdgpu_kernel void @timestwo() #0 {
+; CHECK-NOT: call void @llvm.donothing()
+; CHECK: %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
+; CHECK: %mul = mul i32 %ld, 2
+; CHECK: store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
 define amdgpu_kernel void @timestwo() {
   %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
   %mul = mul i32 %ld, 2