diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
@@ -23,6 +23,10 @@
 Align getAlign(DataLayout const &DL, const GlobalVariable *GV);
 
+/// \returns true if a given global variable \p GV (or its global users)
+/// appears as a use within some instruction (either from kernel or non-kernel).
+bool hasUserInstruction(const GlobalVariable *GV);
+
 /// \returns true if an LDS global requires lowering to a module LDS structure
 /// if \p F is not given. If \p F is given it must be a kernel and function
 /// \returns true if an LDS global is directly used from that kernel and it
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
@@ -42,60 +42,123 @@
   return false;
 }
 
+bool hasUserInstruction(const GlobalVariable *GV) {
+  SmallPtrSet<const User *, 8> Visited;
+  SmallVector<const User *, 16> Stack(GV->users());
+  while (!Stack.empty()) {
+    const User *U = Stack.pop_back_val();
+
+    if (!Visited.insert(U).second) {
+      continue;
+    }
+
+    if (isa<GlobalVariable>(U) || isa<ConstantExpr>(U)) {
+      append_range(Stack, U->users());
+      continue;
+    }
+
+    if (isa<Instruction>(U)) {
+      return true;
+    }
+  }
+
+  return false;
+}
+
 bool shouldLowerLDSToStruct(const SmallPtrSetImpl<GlobalValue *> &UsedList,
                             const GlobalVariable &GV, const Function *F) {
-  // Any LDS variable can be lowered by moving into the created struct
-  // Each variable so lowered is allocated in every kernel, so variables
-  // whose users are all known to be safe to lower without the transform
-  // are left unchanged.
+  // We are not interested in kernel LDS lowering for module LDS itself.
+  if (F && GV.getName() == "llvm.amdgcn.module.lds") {
+    return false;
+  }
+
   bool Ret = false;
   SmallPtrSet<const User *, 8> Visited;
   SmallVector<const User *, 16> Stack(GV.users());
+  SmallPtrSet<const GlobalVariable *, 8> GlobalUsers;
 
   assert(!F || isKernelCC(F));
 
   while (!Stack.empty()) {
-    const User *V = Stack.pop_back_val();
-    Visited.insert(V);
+    const User *U = Stack.pop_back_val();
+
+    // Ignore already visited users.
+    if (!Visited.insert(U).second) {
+      continue;
+    }
 
-    if (auto *G = dyn_cast<GlobalValue>(V->stripPointerCasts())) {
-      if (UsedList.contains(G)) {
+    if (auto *G = dyn_cast<GlobalVariable>(U)) {
+      // The user of GV is another global variable G.
+      StringRef GName = G->getName();
+      if (GName != "llvm.used" && GName != "llvm.compiler.used" &&
+          !UsedList.contains(G)) {
+        // GV is genuinely used in global scope as an initializer of another
+        // global G.
+        if (F) {
+          // Kernel LDS lowering should not lower a GV which is genuinely used
+          // in global scope.
+          return false;
+        }
+        // Whether to module-lower GV depends on the uses of G itself. We need
+        // to explore the uses of G. Hence save G for now.
+        GlobalUsers.insert(G);
+        continue;
+      } else {
+        // The user G is the llvm.used/llvm.compiler.used list itself or is in
+        // that list. Whether to lower GV or not depends on the uses of G.
+        if (F) {
+          if (UsedList.contains(G)) {
+            // G is in the compiler used list, and it is difficult to track
+            // the uses of G for kernel LDS lowering. Hence kernel LDS lowering
+            // should be ignored for GV.
+            return false;
+          }
+          // G is the llvm.used/llvm.compiler.used list itself, and we can
+          // safely kernel-lower GV if it is used within the kernel.
+        } else {
+          // For module LDS lowering, we need to explore the uses of G itself.
+          GlobalUsers.insert(G);
+        }
         continue;
       }
     }
 
-    if (auto *I = dyn_cast<Instruction>(V)) {
-      const Function *UF = I->getFunction();
-      if (UF == F) {
-        // Used from this kernel, we want to put it into the structure.
-        Ret = true;
-      } else if (!F) {
-        Ret |= !isKernelCC(UF);
-      }
+    if (isa<ConstantExpr>(U)) {
+      // Recursively traverse through constant expressions.
+      append_range(Stack, U->users());
       continue;
     }
 
-    if (auto *E = dyn_cast<ConstantExpr>(V)) {
-      if (F) {
-        // Any use which does not end up an instruction disqualifies a
-        // variable to be put into a kernel's LDS structure because later
-        // we will need to replace only this kernel's uses for which we
-        // need to identify a using function.
-        if (!isUsedOnlyFromFunction(E, F))
-          return false;
+    // The user must be an instruction from some function.
+    auto *UF = cast<Instruction>(U)->getFunction();
+    if (F) {
+      if (!isKernelCC(UF)) {
+        // The user instruction is from a non-kernel function, which means
+        // kernel LDS lowering should ignore lowering of GV.
+        return false;
       }
-      for (const User *U : E->users()) {
-        if (Visited.insert(U).second) {
-          Stack.push_back(U);
-        }
+      // The user instruction is from a kernel function. But whether to lower
+      // GV for kernel F depends on whether GV is used within kernel F.
+      Ret |= (UF == F);
+    } else {
+      if (!isKernelCC(UF)) {
+        // The user instruction is from a non-kernel function, which means
+        // module LDS lowering should lower GV.
+        return true;
      }
-      continue;
+      // We cannot decide yet if module LDS lowering should lower GV or not.
+      // Keep exploring.
    }
+  }
 
-    // Unknown user, conservatively lower the variable.
-    // For module LDS conservatively means place it into the module LDS struct.
-    // For kernel LDS it means lower as a standalone variable.
-    return !F;
+  if (!Ret && !F) {
+    // We have not yet decided if module LDS lowering should lower GV or not.
+    // Explore all global users of GV, and check if at least one of these
+    // global users appears as a use within an instruction (possibly a nested
+    // use via a constant expression); if so, then conservatively lower GV.
+    for (auto *UG : GlobalUsers) {
+      Ret |= hasUserInstruction(UG);
+    }
   }
 
   return Ret;
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
--- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -947,22 +947,22 @@
 ; CI: ; %bb.0:
 ; CI-NEXT: v_mov_b32_e32 v0, 0
 ; CI-NEXT: s_mov_b32 m0, -1
-; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
+; CI-NEXT: ds_read2_b32 v[0:1], v0 offset0:1 offset1:4
 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; CI-NEXT: s_mov_b32 s3, 0xf000
 ; CI-NEXT: s_mov_b32 s2, -1
 ; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v1
 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
 ; CI-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: load_constant_adjacent_offsets:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
+; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset0:1 offset1:4
 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
+; GFX9-NEXT: v_add_u32_e32 v0, v1, v0
 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
 ; GFX9-NEXT: s_endpgm
 %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
@@ -977,22 +977,22 @@
 ; CI: ; %bb.0:
 ; CI-NEXT: v_mov_b32_e32 v0, 0
 ; CI-NEXT: s_mov_b32 m0, -1
-; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:2
+; CI-NEXT: ds_read2_b32 v[0:1], v0 offset0:2 offset1:4
 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; CI-NEXT: s_mov_b32 s3, 0xf000
 ; CI-NEXT: s_mov_b32 s2, -1
 ; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v1
 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
 ; CI-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: load_constant_disjoint_offsets:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:2
+; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset0:2 offset1:4
 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
+; GFX9-NEXT: v_add_u32_e32 v0, v1, v0
 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
 ; GFX9-NEXT: s_endpgm
 %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
--- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
@@ -774,21 +774,17 @@
 define amdgpu_kernel void @store_constant_adjacent_offsets() {
 ; CI-LABEL: store_constant_adjacent_offsets:
 ; CI: ; %bb.0:
-; CI-NEXT: s_movk_i32 s0, 0x7b
-; CI-NEXT: v_mov_b32_e32 v0, 0
-; CI-NEXT: v_mov_b32_e32 v1, s0
-; CI-NEXT: v_mov_b32_e32 v2, s0
+; CI-NEXT: v_mov_b32_e32 v0, 0x7b
+; CI-NEXT: v_mov_b32_e32 v1, 0
 ; CI-NEXT: s_mov_b32 m0, -1
-; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
+; CI-NEXT: ds_write2_b32 v1, v0, v0 offset0:1 offset1:4
 ; CI-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: store_constant_adjacent_offsets:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_movk_i32 s0, 0x7b
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: ds_write2_b32 v1, v0, v0 offset0:1 offset1:4
 ; GFX9-NEXT: s_endpgm
 store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
 store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
@@ -801,14 +797,14 @@
 ; CI-NEXT: v_mov_b32_e32 v0, 0x7b
 ; CI-NEXT: v_mov_b32_e32 v1, 0
 ; CI-NEXT: s_mov_b32 m0, -1
-; CI-NEXT: ds_write2_b32 v1, v0, v0 offset1:2
+; CI-NEXT: ds_write2_b32 v1, v0, v0 offset0:2 offset1:4
 ; CI-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: store_constant_disjoint_offsets:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: ds_write2_b32 v1, v0, v0 offset1:2
+; GFX9-NEXT: ds_write2_b32 v1, v0, v0 offset0:2 offset1:4
 ; GFX9-NEXT: s_endpgm
 store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
 store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll
@@ -1,16 +1,29 @@
 ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
 ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
 
-; CHECK: %llvm.amdgcn.kernel.k2.lds.t = type { i32 }
-; CHECK-NOT: %llvm.amdgcn.kernel.k4.lds.t
-
-@lds.1 = internal unnamed_addr addrspace(3) global [2 x i8] undef, align 1
-
-; Use constant from different kernels
 ;.
+; Kernel specific struct types.
+; CHECK: %llvm.amdgcn.kernel.k0.lds.t = type { [2 x i8] }
+; CHECK: %llvm.amdgcn.kernel.k1.lds.t = type { [2 x i8] }
+; CHECK: %llvm.amdgcn.kernel.k2.lds.t = type { i32 }
+; CHECK: %llvm.amdgcn.kernel.k3.lds.t = type { [32 x i8] }
+; CHECK: %llvm.amdgcn.kernel.k4.lds.t = type { [2 x i8] }
+;
+; FIXME: We do not yet handle the same constant (which uses LDS) appearing in
+; two different kernels. Hence, @lds.1 is not completely lowered even though
+; corresponding struct instances are created for it.
 ; CHECK: @lds.1 = internal unnamed_addr addrspace(3) global [2 x i8] undef, align 1
+;
+; Kernel specific struct instances.
+; CHECK: @llvm.amdgcn.kernel.k0.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k0.lds.t undef, align 1
+; CHECK: @llvm.amdgcn.kernel.k1.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k1.lds.t undef, align 1
 ; CHECK: @llvm.amdgcn.kernel.k2.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k2.lds.t undef, align 4
+; CHECK: @llvm.amdgcn.kernel.k3.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k3.lds.t undef, align 1
+; CHECK: @llvm.amdgcn.kernel.k4.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k4.lds.t undef, align 1
 ;.
+
+@lds.1 = internal unnamed_addr addrspace(3) global [2 x i8] undef, align 1
+
 define amdgpu_kernel void @k0(i64 %x) {
 ; CHECK-LABEL: @k0(
 ; CHECK-NEXT: %ptr = getelementptr inbounds i8, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(3)* @lds.1, i32 0, i32 0) to i8*), i64 %x
@@ -33,6 +46,7 @@
   ret void
 }
 
+; CHECK-NOT: @lds.2
 @lds.2 = internal unnamed_addr addrspace(3) global i32 undef, align 4
 
 ; Use constant twice from the same kernel
@@ -51,6 +65,7 @@
   ret void
 }
 
+; CHECK-NOT: @lds.3
 @lds.3 = internal unnamed_addr addrspace(3) global [32 x i8] undef, align 1
 
 ; Use constant twice from the same kernel but a different other constant.
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-uses.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-uses.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-uses.ll
@@ -0,0 +1,88 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
+
+;.
+; @lds.1: is part of the @llvm.used list, hence it is not lowered.
+; @lds.2: is part of the @llvm.compiler.used list, hence it is not lowered.
+; @lds.3: is used as an initializer of @gptr.3, but @gptr.3 itself is not used anywhere,
+; hence @lds.3 is not lowered.
+; @lds.4: is used as an initializer of @gptr.4, and @gptr.4 is part of the @llvm.compiler.used list,
+; and is nowhere else used, hence @lds.4 is not lowered.
+;
+; @lds.5: is used as an initializer of @gptr.5, and @gptr.5 is part of the @llvm.compiler.used list,
+; but @gptr.5 is also used within kernel @k0, hence @lds.5 is lowered.
+; @lds.6: is used as an initializer of @gptr.6, and @gptr.6 is part of the @llvm.compiler.used list,
+; but @gptr.6 is also used within non-kernel function @f0, hence @lds.6 is lowered.
+; @lds.7: is used as an initializer of @gptr.7, and @gptr.7 is used as an initializer of @gptr.8, and
+; @gptr.8 is used within non-kernel function @f1, hence @lds.7 is lowered.
+;.
+
+; CHECK: %llvm.amdgcn.module.lds.t = type { [3 x float], [4 x i8], [2 x float], [1 x float] }
+
+; CHECK: @lds.1 = addrspace(3) global i16 undef, align 2
+; CHECK: @lds.2 = addrspace(3) global i32 undef, align 4
+; CHECK: @lds.3 = addrspace(3) global i64 undef, align 8
+; CHECK: @lds.4 = addrspace(3) global float undef, align 4
+; CHECK-NOT: @lds.5
+; CHECK-NOT: @lds.6
+; CHECK-NOT: @lds.7
+@lds.1 = addrspace(3) global i16 undef, align 2
+@lds.2 = addrspace(3) global i32 undef, align 4
+@lds.3 = addrspace(3) global i64 undef, align 8
+@lds.4 = addrspace(3) global float undef, align 4
+@lds.5 = addrspace(3) global [1 x float] undef, align 4
+@lds.6 = addrspace(3) global [2 x float] undef, align 8
+@lds.7 = addrspace(3) global [3 x float] undef, align 16
+
+; CHECK: @gptr.3 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* @lds.3 to i64*), align 8
+; CHECK: @gptr.4 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* bitcast (float addrspace(3)* @lds.4 to i64 addrspace(3)*) to i64*), align 8
+; CHECK: @gptr.5 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* bitcast ([1 x float] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 3) to i64 addrspace(3)*) to i64*), align 8
+; CHECK: @gptr.6 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* bitcast ([2 x float] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2) to i64 addrspace(3)*) to i64*), align 8
+; CHECK: @gptr.7 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i64 addrspace(3)*) to i64*), align 8
+; CHECK: @gptr.8 = addrspace(1) global i64** addrspacecast (i64* addrspace(1)* @gptr.7 to i64**), align 8
+@gptr.3 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* @lds.3 to i64*), align 8
+@gptr.4 = addrspace(1) global i64* addrspacecast (float addrspace(3)* @lds.4 to i64*), align 8
+@gptr.5 = addrspace(1) global i64* addrspacecast ([1 x float] addrspace(3)* @lds.5 to i64*), align 8
+@gptr.6 = addrspace(1) global i64* addrspacecast ([2 x float] addrspace(3)* @lds.6 to i64*), align 8
+@gptr.7 = addrspace(1) global i64* addrspacecast ([3 x float] addrspace(3)* @lds.7 to i64*), align 8
+@gptr.8 = addrspace(1) global i64** addrspacecast (i64* addrspace(1)* @gptr.7 to i64**), align 8
+
+; CHECK: @llvm.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i16 addrspace(3)* @lds.1 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
+; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 16
+; CHECK: @llvm.compiler.used = appending global [5 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* @lds.2 to i8 addrspace(3)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.4 to i8 addrspace(1)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.5 to i8 addrspace(1)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.6 to i8 addrspace(1)*) to i8*), i8* addrspacecast (i8 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
+@llvm.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i16 addrspace(3)* @lds.1 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
+@llvm.compiler.used = appending global [4 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* @lds.2 to i8 addrspace(3)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.4 to i8 addrspace(1)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.5 to i8 addrspace(1)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.6 to i8 addrspace(1)*) to i8*)], section "llvm.metadata"
+
+; CHECK-LABEL: @f1()
+; CHECK: %ld = load i64**, i64** addrspace(1)* @gptr.8, align 8
+; CHECK: ret void
+define void @f1() {
+  %ld = load i64**, i64** addrspace(1)* @gptr.8
+  ret void
+}
+
+; CHECK-LABEL: @f0()
+; CHECK: %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.6 to i32 addrspace(1)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.6 to i32 addrspace(1)*) to i32*) to i64)) to i32*), align 4
+; CHECK: ret void
+define void @f0() {
+  %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.6 to i32 addrspace(1)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.6 to i32 addrspace(1)*) to i32*) to i64)) to i32*), align 4
  ret void
+}
+
+; CHECK-LABEL: @k0()
+; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
+; CHECK: %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.5 to i32 addrspace(1)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.5 to i32 addrspace(1)*) to i32*) to i64)) to i32*), align 4
+; CHECK: ret void
+define amdgpu_kernel void @k0() {
+  %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.5 to i32 addrspace(1)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.5 to i32 addrspace(1)*) to i32*) to i64)) to i32*), align 4
  ret void
+}
+
+; CHECK-LABEL: @k1()
+; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
+; CHECK: ret void
+define amdgpu_kernel void @k1() {
+  ret void
+}
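
For reference, a minimal standalone sketch of the initializer-chain case exercised by @lds.7/@gptr.7/@gptr.8/@f1 above (illustrative names; assumes the same -amdgpu-lower-module-lds pipeline as the tests):

; @lds.x has no direct instruction user, but it reaches the load in @func
; through the initializers of @gptr.x and @gptr.y, so hasUserInstruction()
; returns true for its global user and module LDS lowering conservatively
; moves @lds.x into @llvm.amdgcn.module.lds.
@lds.x = addrspace(3) global [3 x float] undef, align 16
@gptr.x = addrspace(1) global i64* addrspacecast ([3 x float] addrspace(3)* @lds.x to i64*), align 8
@gptr.y = addrspace(1) global i64** addrspacecast (i64* addrspace(1)* @gptr.x to i64**), align 8

define void @func() {
  %ld = load i64**, i64** addrspace(1)* @gptr.y
  ret void
}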