diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
@@ -23,6 +23,10 @@
 Align getAlign(DataLayout const &DL, const GlobalVariable *GV);
 
+/// \returns true if a given global variable \p GV (or its global users) appears
+/// as a use within some instruction (either from a kernel or from a non-kernel).
+bool hasUserInstruction(const GlobalVariable *GV);
+
 /// \returns true if an LDS global requres lowering to a module LDS structure
 /// if \p F is not given. If \p F is given it must be a kernel and function
 /// \returns true if an LDS global is directly used from that kernel and it
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
@@ -42,60 +42,114 @@
   return false;
 }
 
+bool hasUserInstruction(const GlobalVariable *GV) {
+  SmallPtrSet<const User *, 8> Visited;
+  SmallVector<const User *, 16> Stack(GV->users());
+
+  while (!Stack.empty()) {
+    const User *U = Stack.pop_back_val();
+
+    if (!Visited.insert(U).second) {
+      continue;
+    }
+
+    if (isa<GlobalVariable>(U) || isa<ConstantExpr>(U)) {
+      append_range(Stack, U->users());
+      continue;
+    }
+
+    if (isa<Instruction>(U)) {
+      return true;
+    }
+  }
+
+  return false;
+}
+
 bool shouldLowerLDSToStruct(const SmallPtrSetImpl<GlobalValue *> &UsedList,
                             const GlobalVariable &GV, const Function *F) {
-  // Any LDS variable can be lowered by moving into the created struct
-  // Each variable so lowered is allocated in every kernel, so variables
-  // whose users are all known to be safe to lower without the transform
-  // are left unchanged.
+  // We are not interested in kernel LDS lowering for module LDS itself.
+  if (F && GV.getName() == "llvm.amdgcn.module.lds") {
+    return false;
+  }
+
   bool Ret = false;
   SmallPtrSet<const User *, 8> Visited;
   SmallVector<const User *, 16> Stack(GV.users());
+  SmallPtrSet<const GlobalVariable *, 8> GlobalUsers;
 
   assert(!F || isKernelCC(F));
 
   while (!Stack.empty()) {
-    const User *V = Stack.pop_back_val();
-    Visited.insert(V);
+    const User *U = Stack.pop_back_val();
 
-    if (auto *G = dyn_cast<GlobalVariable>(V->stripPointerCasts())) {
-      if (UsedList.contains(G)) {
+    // Ignore already visited users.
+    if (!Visited.insert(U).second) {
+      continue;
+    }
+
+    if (auto *G = dyn_cast<GlobalVariable>(U)) {
+      // The user of GV is another global variable G.
+      StringRef GName = G->getName();
+      if (GName != "llvm.used" && GName != "llvm.compiler.used" &&
+          !UsedList.contains(G)) {
+        // GV is genuinely used in global scope.
+        if (F) {
+          // Kernel LDS lowering should not lower GV.
+          return false;
+        } else {
+          // Whether module LDS lowering should lower GV depends on how G itself
+          // is used. Save G so that its uses can be explored later.
+          GlobalUsers.insert(G);
+          continue;
+        }
+      } else {
+        // The user G is the llvm.used/llvm.compiler.used list itself, or it is
+        // contained in the UsedList. For kernel LDS lowering, we need to lower
+        // GV if it is used in kernel scope. For module LDS lowering, we need
+        // to explore the uses of G itself.
+        if (!F) {
+          GlobalUsers.insert(G);
+        }
         continue;
       }
     }
 
-    if (auto *I = dyn_cast<Instruction>(V)) {
-      const Function *UF = I->getFunction();
-      if (UF == F) {
-        // Used from this kernel, we want to put it into the structure.
-        Ret = true;
-      } else if (!F) {
-        Ret |= !isKernelCC(UF);
-      }
+    if (isa<ConstantExpr>(U)) {
+      // Recursively traverse through constant expressions.
+      append_range(Stack, U->users());
       continue;
     }
 
-    if (auto *E = dyn_cast<ConstantExpr>(V)) {
-      if (F) {
-        // Any use which does not end up an instruction disqualifies a
-        // variable to be put into a kernel's LDS structure because later
-        // we will need to replace only this kernel's uses for which we
-        // need to identify a using function.
-        if (!isUsedOnlyFromFunction(E, F))
-          return false;
+    // The user must be an instruction from some function.
+    auto *UF = cast<Instruction>(U)->getFunction();
+    if (F) {
+      if (!isKernelCC(UF)) {
+        // The user instruction is from a non-kernel function, so kernel LDS
+        // lowering should not lower GV.
+        return false;
       }
-      for (const User *U : E->users()) {
-        if (Visited.insert(U).second) {
-          Stack.push_back(U);
-        }
+      // The user instruction is from a kernel function. Whether to lower GV
+      // for kernel F depends on whether GV is used within F itself.
+      Ret |= (UF == F);
+    } else {
+      if (!isKernelCC(UF)) {
+        // The user instruction is from a non-kernel function, so module LDS
+        // lowering should lower GV.
+        return true;
      }
-      continue;
+      // We cannot decide yet if module LDS lowering should lower GV or not.
+      // Keep exploring.
     }
+  }
 
-    // Unknown user, conservatively lower the variable.
-    // For module LDS conservatively means place it into the module LDS struct.
-    // For kernel LDS it means lower as a standalone variable.
-    return !F;
+  if (!Ret && !F) {
+    // We have not yet decided if module LDS lowering should lower GV or not.
+    // Explore all global users of GV, and check if at least one of these
+    // global users appears as a use within an instruction (possibly a nested
+    // use via a constant expression); if so, conservatively lower GV.
+    for (auto *UG : GlobalUsers) {
+      Ret |= hasUserInstruction(UG);
+    }
   }
 
   return Ret;
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
--- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -947,22 +947,22 @@
 ; CI:       ; %bb.0:
 ; CI-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-NEXT:    s_mov_b32 m0, -1
-; CI-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
+; CI-NEXT:    ds_read2_b32 v[0:1], v0 offset0:1 offset1:4
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 ; CI-NEXT:    s_mov_b32 s2, -1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; CI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_constant_adjacent_offsets:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
+; GFX9-NEXT:    ds_read2_b32 v[0:1], v2 offset0:1 offset1:4
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
+; GFX9-NEXT:    v_add_u32_e32 v0, v1, v0
 ; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
@@ -977,22 +977,22 @@
 ; CI:       ; %bb.0:
 ; CI-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-NEXT:    s_mov_b32 m0, -1
-; CI-NEXT:    ds_read2_b32 v[0:1], v0 offset1:2
+; CI-NEXT:    ds_read2_b32 v[0:1], v0 offset0:2 offset1:4
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 ; CI-NEXT:    s_mov_b32 s2, -1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; CI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_constant_disjoint_offsets:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    ds_read2_b32 v[0:1], v2 offset1:2
+; GFX9-NEXT:    ds_read2_b32 v[0:1], v2 offset0:2 offset1:4
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
+; GFX9-NEXT:    v_add_u32_e32 v0, v1, v0
 ; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
--- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
@@ -774,21 +774,17 @@
 define amdgpu_kernel void @store_constant_adjacent_offsets() {
 ; CI-LABEL: store_constant_adjacent_offsets:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_movk_i32 s0, 0x7b
-; CI-NEXT:    v_mov_b32_e32 v0, 0
-; CI-NEXT:    v_mov_b32_e32 v1, s0
-; CI-NEXT:    v_mov_b32_e32 v2, s0
+; CI-NEXT:    v_mov_b32_e32 v0, 0x7b
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_mov_b32 m0, -1
-; CI-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
+; CI-NEXT:    ds_write2_b32 v1, v0, v0 offset0:1 offset1:4
 ; CI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: store_constant_adjacent_offsets:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_movk_i32 s0, 0x7b
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    ds_write2_b32 v1, v0, v0 offset0:1 offset1:4
 ; GFX9-NEXT:    s_endpgm
   store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
   store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
@@ -801,14 +797,14 @@
 ; CI-NEXT:    v_mov_b32_e32 v0, 0x7b
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_mov_b32 m0, -1
-; CI-NEXT:    ds_write2_b32 v1, v0, v0 offset1:2
+; CI-NEXT:    ds_write2_b32 v1, v0, v0 offset0:2 offset1:4
 ; CI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: store_constant_disjoint_offsets:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7b
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    ds_write2_b32 v1, v0, v0 offset1:2
+; GFX9-NEXT:    ds_write2_b32 v1, v0, v0 offset0:2 offset1:4
 ; GFX9-NEXT:    s_endpgm
   store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
   store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll
@@ -1,16 +1,29 @@
 ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
 ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
 
-; CHECK: %llvm.amdgcn.kernel.k2.lds.t = type { i32 }
-; CHECK-NOT: %llvm.amdgcn.kernel.k4.lds.t
-
-@lds.1 = internal unnamed_addr addrspace(3) global [2 x i8] undef, align 1
-
-; Use constant from different kernels
 ;.
+; Kernel specific struct types.
+; CHECK: %llvm.amdgcn.kernel.k0.lds.t = type { [2 x i8] }
+; CHECK: %llvm.amdgcn.kernel.k1.lds.t = type { [2 x i8] }
+; CHECK: %llvm.amdgcn.kernel.k2.lds.t = type { i32 }
+; CHECK: %llvm.amdgcn.kernel.k3.lds.t = type { [32 x i8] }
+; CHECK: %llvm.amdgcn.kernel.k4.lds.t = type { [2 x i8] }
+;
+; FIXME: We do not yet handle the same constant (which uses LDS) appearing in two different kernels.
+; Hence, @lds.1 is not completely lowered even though corresponding struct instances are
+; created for it.
 ; CHECK: @lds.1 = internal unnamed_addr addrspace(3) global [2 x i8] undef, align 1
+;
+; Kernel specific struct instances.
+; CHECK: @llvm.amdgcn.kernel.k0.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k0.lds.t undef, align 1
+; CHECK: @llvm.amdgcn.kernel.k1.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k1.lds.t undef, align 1
 ; CHECK: @llvm.amdgcn.kernel.k2.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k2.lds.t undef, align 4
+; CHECK: @llvm.amdgcn.kernel.k3.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k3.lds.t undef, align 1
+; CHECK: @llvm.amdgcn.kernel.k4.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k4.lds.t undef, align 1
 ;.
+
+@lds.1 = internal unnamed_addr addrspace(3) global [2 x i8] undef, align 1
+
 define amdgpu_kernel void @k0(i64 %x) {
 ; CHECK-LABEL: @k0(
 ; CHECK-NEXT:    %ptr = getelementptr inbounds i8, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(3)* @lds.1, i32 0, i32 0) to i8*), i64 %x
@@ -33,6 +46,7 @@
   ret void
 }
 
+; CHECK-NOT: @lds.2
 @lds.2 = internal unnamed_addr addrspace(3) global i32 undef, align 4
 
 ; Use constant twice from the same kernel
@@ -51,6 +65,7 @@
   ret void
 }
 
+; CHECK-NOT: @lds.3
 @lds.3 = internal unnamed_addr addrspace(3) global [32 x i8] undef, align 1
 
 ; Use constant twice from the same kernel but a different other constant.
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-uses.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-uses.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-uses.ll
@@ -0,0 +1,88 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
+
+;.
+; @lds.1: is part of the @llvm.used list, hence it is not lowered.
+; @lds.2: is part of the @llvm.compiler.used list, hence it is not lowered.
+; @lds.3: is used as an initializer of @gptr.3, but @gptr.3 itself is not used anywhere,
+;         hence @lds.3 is not lowered.
+; @lds.4: is used as an initializer of @gptr.4, @gptr.4 is part of the @llvm.compiler.used list,
+;         and is used nowhere else, hence @lds.4 is not lowered.
+;
+; @lds.5: is used as an initializer of @gptr.5, and @gptr.5 is part of the @llvm.compiler.used list,
+;         but @gptr.5 is also used within kernel @k0, hence @lds.5 is lowered.
+; @lds.6: is used as an initializer of @gptr.6, and @gptr.6 is part of the @llvm.compiler.used list,
+;         but @gptr.6 is also used within the non-kernel function @f0, hence @lds.6 is lowered.
+; @lds.7: is used as an initializer of @gptr.7, @gptr.7 is used as an initializer of @gptr.8, and
+;         @gptr.8 is used within the non-kernel function @f1, hence @lds.7 is lowered.
+;.
+
+; CHECK: %llvm.amdgcn.module.lds.t = type { [3 x float], [4 x i8], [2 x float], [1 x float] }
+
+; CHECK: @lds.1 = addrspace(3) global i16 undef, align 2
+; CHECK: @lds.2 = addrspace(3) global i32 undef, align 4
+; CHECK: @lds.3 = addrspace(3) global i64 undef, align 8
+; CHECK: @lds.4 = addrspace(3) global float undef, align 4
+; CHECK-NOT: @lds.5
+; CHECK-NOT: @lds.6
+; CHECK-NOT: @lds.7
+@lds.1 = addrspace(3) global i16 undef, align 2
+@lds.2 = addrspace(3) global i32 undef, align 4
+@lds.3 = addrspace(3) global i64 undef, align 8
+@lds.4 = addrspace(3) global float undef, align 4
+@lds.5 = addrspace(3) global [1 x float] undef, align 4
+@lds.6 = addrspace(3) global [2 x float] undef, align 8
+@lds.7 = addrspace(3) global [3 x float] undef, align 16
+
+; CHECK: @gptr.3 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* @lds.3 to i64*), align 8
+; CHECK: @gptr.4 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* bitcast (float addrspace(3)* @lds.4 to i64 addrspace(3)*) to i64*), align 8
+; CHECK: @gptr.5 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* bitcast ([1 x float] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 3) to i64 addrspace(3)*) to i64*), align 8
+; CHECK: @gptr.6 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* bitcast ([2 x float] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2) to i64 addrspace(3)*) to i64*), align 8
+; CHECK: @gptr.7 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i64 addrspace(3)*) to i64*), align 8
+; CHECK: @gptr.8 = addrspace(1) global i64** addrspacecast (i64* addrspace(1)* @gptr.7 to i64**), align 8
+@gptr.3 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* @lds.3 to i64*), align 8
+@gptr.4 = addrspace(1) global i64* addrspacecast (float addrspace(3)* @lds.4 to i64*), align 8
+@gptr.5 = addrspace(1) global i64* addrspacecast ([1 x float] addrspace(3)* @lds.5 to i64*), align 8
+@gptr.6 = addrspace(1) global i64* addrspacecast ([2 x float] addrspace(3)* @lds.6 to i64*), align 8
+@gptr.7 = addrspace(1) global i64* addrspacecast ([3 x float] addrspace(3)* @lds.7 to i64*), align 8
+@gptr.8 = addrspace(1) global i64** addrspacecast (i64* addrspace(1)* @gptr.7 to i64**), align 8
+
+; CHECK: @llvm.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i16 addrspace(3)* @lds.1 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
+; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 16
+; CHECK: @llvm.compiler.used = appending global [5 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* @lds.2 to i8 addrspace(3)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.4 to i8 addrspace(1)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.5 to i8 addrspace(1)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.6 to i8 addrspace(1)*) to i8*), i8* addrspacecast (i8 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
+@llvm.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i16 addrspace(3)* @lds.1 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
"llvm.metadata" +@llvm.compiler.used = appending global [4 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* @lds.2 to i8 addrspace(3)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.4 to i8 addrspace(1)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.5 to i8 addrspace(1)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.6 to i8 addrspace(1)*) to i8*)], section "llvm.metadata" + +; CHECK-LABEL: @f1() +; CHECK: %ld = load i64**, i64** addrspace(1)* @gptr.8, align 8 +; CHECK: ret void +define void @f1() { + %ld = load i64**, i64** addrspace(1)* @gptr.8 + ret void +} + +; CHECK-LABEL: @f0() +; CHECK: %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.6 to i32 addrspace(1)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 +; CHECK: addrspace(1)* bitcast (i64* addrspace(1)* @gptr.6 to i32 addrspace(1)*) to i32*) to i64)) to i32*), align 4 +; CHECK: ret void +define void @f0() { + %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.6 to i32 addrspace(1)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.6 to i32 addrspace(1)*) to i32*) to i64)) to i32*), align 4 + ret void +} + +; CHECK-LABEL: @k0() +; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ] +; CHECK: %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.5 to i32 addrspace(1)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 +; CHECK: addrspace(1)* bitcast (i64* addrspace(1)* @gptr.5 to i32 addrspace(1)*) to i32*) to i64)) to i32*), align 4 +; CHECK: ret void +define amdgpu_kernel void @k0() { + %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.5 to i32 addrspace(1)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.5 to i32 addrspace(1)*) to i32*) to i64)) to i32*), align 4 + ret void +} + +; CHECK-LABEL: @k1() +; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ] +; CHECK: ret void +define amdgpu_kernel void @k1() { + ret void +}