diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -171,7 +171,7 @@
 
   // Find variables to move into new struct instance
   std::vector<GlobalVariable *> FoundLocalVars =
-      AMDGPU::findVariablesToLower(M, UsedList, F);
+      AMDGPU::findVariablesToLower(M, F);
 
   if (FoundLocalVars.empty()) {
     // No variables to rewrite, no changes made.
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
@@ -25,19 +25,19 @@
 
 Align getAlign(DataLayout const &DL, const GlobalVariable *GV);
 
+/// \returns true if the global variable \p GV (or any of its global users)
+/// appears as a use within some instruction (from kernel or non-kernel).
+bool hasUserInstruction(const GlobalValue *GV);
+
 /// \returns true if an LDS global requires lowering to a module LDS structure
 /// if \p F is not given. If \p F is given it must be a kernel, and the
 /// function \returns true if an LDS global is directly used from that kernel
 /// and it is safe to replace its uses with a kernel LDS structure member.
-/// \p UsedList contains a union of llvm.used and llvm.compiler.used variables
-/// which do not count as a use.
-bool shouldLowerLDSToStruct(const SmallPtrSetImpl<GlobalValue *> &UsedList,
-                            const GlobalVariable &GV,
+bool shouldLowerLDSToStruct(const GlobalVariable &GV,
                             const Function *F = nullptr);
 
-std::vector<GlobalVariable *>
-findVariablesToLower(Module &M, const SmallPtrSetImpl<GlobalValue *> &UsedList,
-                     const Function *F = nullptr);
+std::vector<GlobalVariable *> findVariablesToLower(Module &M,
+                                                   const Function *F = nullptr);
 
 SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M);
 
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
@@ -60,15 +60,35 @@
   }
 }
 
-bool shouldLowerLDSToStruct(const SmallPtrSetImpl<GlobalValue *> &UsedList,
-                            const GlobalVariable &GV, const Function *F) {
-  // Any LDS variable can be lowered by moving into the created struct
-  // Each variable so lowered is allocated in every kernel, so variables
-  // whose users are all known to be safe to lower without the transform
-  // are left unchanged.
+bool hasUserInstruction(const GlobalValue *GV) {
+  SmallPtrSet<const User *, 8> Visited;
+  SmallVector<const User *, 16> Stack(GV->users());
+
+  while (!Stack.empty()) {
+    const User *U = Stack.pop_back_val();
+
+    if (!Visited.insert(U).second)
+      continue;
+
+    if (isa<Instruction>(U))
+      return true;
+
+    auto *C = cast<Constant>(U);
+    append_range(Stack, C->users());
+  }
+
+  return false;
+}
+
+bool shouldLowerLDSToStruct(const GlobalVariable &GV, const Function *F) {
+  // We are not interested in kernel LDS lowering for module LDS itself.
+  if (F && GV.getName() == "llvm.amdgcn.module.lds")
+    return false;
+
   bool Ret = false;
   SmallPtrSet<const User *, 8> Visited;
   SmallVector<const User *, 16> Stack(GV.users());
+  SmallPtrSet<const GlobalValue *, 8> GlobalUsers;
 
   assert(!F || isKernelCC(F));
 
@@ -76,10 +96,16 @@
     const User *V = Stack.pop_back_val();
     Visited.insert(V);
 
-    if (auto *G = dyn_cast<GlobalValue>(V->stripPointerCasts())) {
-      if (UsedList.contains(G)) {
-        continue;
+    if (auto *G = dyn_cast<GlobalValue>(V)) {
+      StringRef GName = G->getName();
+      if (F && GName != "llvm.used" && GName != "llvm.compiler.used") {
+        // For kernel LDS lowering, if G is not the llvm.used or the
+        // llvm.compiler.used list, then we cannot lower this LDS GV, since we
+        // cannot replace the use of GV within G.
+        return false;
       }
+      GlobalUsers.insert(G);
+      continue;
     }
 
     if (auto *I = dyn_cast<Instruction>(V)) {
@@ -88,32 +114,32 @@
         // Used from this kernel, we want to put it into the structure.
         Ret = true;
       } else if (!F) {
+        // For module LDS lowering, lowering is required if the user instruction
+        // is from a non-kernel function.
         Ret |= !isKernelCC(UF);
       }
       continue;
     }
 
-    if (auto *E = dyn_cast<ConstantExpr>(V)) {
-      for (const User *U : E->users()) {
-        if (Visited.insert(U).second) {
-          Stack.push_back(U);
-        }
-      }
-      continue;
-    }
+    // User V should be a constant; recursively visit users of V.
+    auto *E = cast<Constant>(V);
+    append_range(Stack, E->users());
+  }
 
-    // Unknown user, conservatively lower the variable.
-    // For module LDS conservatively means place it into the module LDS struct.
-    // For kernel LDS it means lower as a standalone variable.
-    return !F;
+  if (!F && !Ret) {
+    // For module LDS lowering, we have not yet decided if we should lower GV
+    // or not. Explore all global users of GV, and check if at least one of
+    // these global users appears as a use within an instruction (possibly a
+    // nested use via a constant expression); if so, conservatively lower GV.
+    for (auto *G : GlobalUsers)
+      Ret |= hasUserInstruction(G);
   }
 
   return Ret;
 }
 
-std::vector<GlobalVariable *>
-findVariablesToLower(Module &M, const SmallPtrSetImpl<GlobalValue *> &UsedList,
-                     const Function *F) {
+std::vector<GlobalVariable *> findVariablesToLower(Module &M,
+                                                   const Function *F) {
   std::vector<GlobalVariable *> LocalVars;
   for (auto &GV : M.globals()) {
     if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
@@ -137,7 +163,7 @@
       // dropped by the back end if not. This pass skips over it.
       continue;
     }
-    if (!shouldLowerLDSToStruct(UsedList, GV, F)) {
+    if (!shouldLowerLDSToStruct(GV, F)) {
       continue;
     }
     LocalVars.push_back(&GV);
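Review note (not part of the patch): the walk in `hasUserInstruction` leans on the IR invariant that a non-instruction user of a global is always a `Constant` — an addrspacecast/bitcast/GEP constant expression, or another global whose initializer references it — which is what makes the `cast<Constant>(U)` safe. A minimal self-contained sketch of the same pattern, under the illustrative name `reachesInstruction`:

```cpp
// Sketch of the user-graph walk performed by hasUserInstruction above.
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

static bool reachesInstruction(const GlobalValue *GV) {
  SmallPtrSet<const User *, 8> Visited;
  SmallVector<const User *, 16> Stack(GV->users());
  while (!Stack.empty()) {
    const User *U = Stack.pop_back_val();
    if (!Visited.insert(U).second)
      continue; // already expanded along another user edge
    if (isa<Instruction>(U))
      return true; // a use from some function body was found
    // Every other user of a global is a Constant (a cast/GEP expression or a
    // global whose initializer embeds the address); look through it.
    append_range(Stack, cast<Constant>(U)->users());
  }
  return false;
}
```

The `Visited` set is what keeps the walk from re-expanding shared constants: constant expressions are uniqued per `LLVMContext`, so the same cast of the same global can be reached along many user edges.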
diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-global-uses.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-global-uses.ll
new file
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-global-uses.ll
@@ -0,0 +1,55 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
+
+;.
+; Kernel LDS lowering.
+;.
+; @lds.1: is part of the @llvm.used list, and is also used within the kernel, hence it is lowered.
+; @lds.2: is part of the @llvm.compiler.used list, and is also used within the kernel, hence it is lowered.
+; @lds.3: is used as an initializer of @gptr.3, hence @lds.3 is not lowered, even though it is used within the kernel.
+; @lds.4: is used as an initializer of @gptr.4, hence @lds.4 is not lowered, even though it is used within the kernel,
+;         irrespective of the uses of @gptr.4 itself (@gptr.4 is part of the @llvm.compiler.used list).
+; @lds.5: is part of the @llvm.used list, but is not used within the kernel, hence it is not lowered.
+; @lds.6: is part of the @llvm.compiler.used list, but is not used within the kernel, hence it is not lowered.
+;.
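What this first test pins down, in kernel mode: a use from another global's initializer blocks lowering, because the address of a kernel-LDS struct member is kernel-dependent and cannot be rewritten into @gptr.3 or @gptr.4, while membership in @llvm.used or @llvm.compiler.used does not block it, since those lists merely keep values alive and the pass knows how to update them. A compact restatement of that rule as a predicate (a sketch only; `blocksKernelLowering` is an illustrative name, and name-matching the two intrinsic lists is the same shortcut the patch itself takes):

```cpp
// Sketch: which global users of an LDS variable forbid kernel LDS lowering.
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/GlobalValue.h"

using namespace llvm;

static bool blocksKernelLowering(const GlobalValue *G) {
  // @llvm.used / @llvm.compiler.used entries only keep a value alive and can
  // be rewritten by the pass, so they do not pin the variable.
  StringRef Name = G->getName();
  if (Name == "llvm.used" || Name == "llvm.compiler.used")
    return false;
  // Any other global user embeds the LDS address in its initializer, which
  // the pass cannot replace with a kernel-dependent struct member address.
  return true;
}
```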
+
+; CHECK: %llvm.amdgcn.kernel.k0.lds.t = type { i32, i16 }
+
+; CHECK-NOT: @lds.1
+; CHECK-NOT: @lds.2
+; CHECK: @lds.3 = addrspace(3) global i64 undef, align 8
+; CHECK: @lds.4 = addrspace(3) global float undef, align 4
+; CHECK: @lds.5 = addrspace(3) global i16 undef, align 2
+; CHECK: @lds.6 = addrspace(3) global i32 undef, align 4
+@lds.1 = addrspace(3) global i16 undef, align 2
+@lds.2 = addrspace(3) global i32 undef, align 4
+@lds.3 = addrspace(3) global i64 undef, align 8
+@lds.4 = addrspace(3) global float undef, align 4
+@lds.5 = addrspace(3) global i16 undef, align 2
+@lds.6 = addrspace(3) global i32 undef, align 4
+
+; CHECK: @gptr.3 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* @lds.3 to i64*), align 8
+; CHECK: @gptr.4 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* bitcast (float addrspace(3)* @lds.4 to i64 addrspace(3)*) to i64*), align 8
+@gptr.3 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* @lds.3 to i64*), align 8
+@gptr.4 = addrspace(1) global i64* addrspacecast (float addrspace(3)* @lds.4 to i64*), align 8
+
+; CHECK: @llvm.amdgcn.kernel.k0.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k0.lds.t undef, align 4
+
+; CHECK: @llvm.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i16 addrspace(3)* @lds.5 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
+; CHECK: @llvm.compiler.used = appending global [2 x i8*] [i8* addrspacecast (i8 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.4 to i8 addrspace(1)*) to i8*), i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* @lds.6 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
+@llvm.used = appending global [2 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i16 addrspace(3)* @lds.1 to i8 addrspace(3)*) to i8*), i8* addrspacecast (i8 addrspace(3)* bitcast (i16 addrspace(3)* @lds.5 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
+@llvm.compiler.used = appending global [3 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* @lds.2 to i8 addrspace(3)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.4 to i8 addrspace(1)*) to i8*), i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* @lds.6 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
+
+; CHECK-LABEL: @k0()
+; CHECK: %ld.lds.1 = load i16, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 1), align 2
+; CHECK: %ld.lds.2 = load i32, i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0), align 4
+; CHECK: %ld.lds.3 = load i64, i64 addrspace(3)* @lds.3, align 4
+; CHECK: %ld.lds.4 = load float, float addrspace(3)* @lds.4, align 4
+; CHECK: ret void
+define amdgpu_kernel void @k0() {
+  %ld.lds.1 = load i16, i16 addrspace(3)* @lds.1
+  %ld.lds.2 = load i32, i32 addrspace(3)* @lds.2
+  %ld.lds.3 = load i64, i64 addrspace(3)* @lds.3
+  %ld.lds.4 = load float, float addrspace(3)* @lds.4
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-uses.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-uses.ll
new file
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-uses.ll
@@ -0,0 +1,88 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
+
+;.
+; @lds.1: is part of the @llvm.used list, and is not used by any instruction. Hence it is not lowered.
+; @lds.2: is part of the @llvm.compiler.used list, and is not used by any instruction. Hence it is not lowered.
+; @lds.3: is used as an initializer of @gptr.3, and is not used by any instruction. @gptr.3 itself is also not
+;         used anywhere else, hence @lds.3 is not lowered.
+; @lds.4: is used as an initializer of @gptr.4, and is not used by any instruction. @gptr.4 is part of the
+;         @llvm.compiler.used list, but is nowhere else used, hence @lds.4 is not lowered.
+;
+; @lds.5: is used as an initializer of @gptr.5, and is not used by any instruction. @gptr.5 is part of the
+;         @llvm.compiler.used list, but is also used within kernel @k0. Hence @lds.5 is lowered.
+; @lds.6: is used as an initializer of @gptr.6, and is not used by any instruction. @gptr.6 is part of the
+;         @llvm.compiler.used list, but is also used within non-kernel function @f0. Hence @lds.6 is lowered.
+; @lds.7: is used as an initializer of @gptr.7, and is not used by any instruction. @gptr.7 is used as an
+;         initializer of @gptr.8, and @gptr.8 is used within non-kernel function @f1. Hence @lds.7 is lowered.
+;.
+
+; CHECK: %llvm.amdgcn.module.lds.t = type { [3 x float], [4 x i8], [2 x float], [1 x float] }
+
+; CHECK: @lds.1 = addrspace(3) global i16 undef, align 2
+; CHECK: @lds.2 = addrspace(3) global i32 undef, align 4
+; CHECK: @lds.3 = addrspace(3) global i64 undef, align 8
+; CHECK: @lds.4 = addrspace(3) global float undef, align 4
+; CHECK-NOT: @lds.5
+; CHECK-NOT: @lds.6
+; CHECK-NOT: @lds.7
+@lds.1 = addrspace(3) global i16 undef, align 2
+@lds.2 = addrspace(3) global i32 undef, align 4
+@lds.3 = addrspace(3) global i64 undef, align 8
+@lds.4 = addrspace(3) global float undef, align 4
+@lds.5 = addrspace(3) global [1 x float] undef, align 4
+@lds.6 = addrspace(3) global [2 x float] undef, align 8
+@lds.7 = addrspace(3) global [3 x float] undef, align 16
+
+; CHECK: @gptr.3 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* @lds.3 to i64*), align 8
+; CHECK: @gptr.4 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* bitcast (float addrspace(3)* @lds.4 to i64 addrspace(3)*) to i64*), align 8
+; CHECK: @gptr.5 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* bitcast ([1 x float] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 3) to i64 addrspace(3)*) to i64*), align 8
+; CHECK: @gptr.6 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* bitcast ([2 x float] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2) to i64 addrspace(3)*) to i64*), align 8
+; CHECK: @gptr.7 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i64 addrspace(3)*) to i64*), align 8
+; CHECK: @gptr.8 = addrspace(1) global i64** addrspacecast (i64* addrspace(1)* @gptr.7 to i64**), align 8
+@gptr.3 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* @lds.3 to i64*), align 8
+@gptr.4 = addrspace(1) global i64* addrspacecast (float addrspace(3)* @lds.4 to i64*), align 8
+@gptr.5 = addrspace(1) global i64* addrspacecast ([1 x float] addrspace(3)* @lds.5 to i64*), align 8
+@gptr.6 = addrspace(1) global i64* addrspacecast ([2 x float] addrspace(3)* @lds.6 to i64*), align 8
+@gptr.7 = addrspace(1) global i64* addrspacecast ([3 x float] addrspace(3)* @lds.7 to i64*), align 8
+@gptr.8 = addrspace(1) global i64** addrspacecast (i64* addrspace(1)* @gptr.7 to i64**), align 8
+
+; CHECK: @llvm.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i16 addrspace(3)* @lds.1 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
+; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 16
+; CHECK: @llvm.compiler.used = appending global [5 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* @lds.2 to i8 addrspace(3)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.4 to i8 addrspace(1)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.5 to i8 addrspace(1)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.6 to i8 addrspace(1)*) to i8*), i8* addrspacecast (i8 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
+@llvm.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i16 addrspace(3)* @lds.1 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
+@llvm.compiler.used = appending global [4 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* @lds.2 to i8 addrspace(3)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.4 to i8 addrspace(1)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.5 to i8 addrspace(1)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.6 to i8 addrspace(1)*) to i8*)], section "llvm.metadata"
+
+; CHECK-LABEL: @f1()
+; CHECK: %ld = load i64**, i64** addrspace(1)* @gptr.8, align 8
+; CHECK: ret void
+define void @f1() {
+  %ld = load i64**, i64** addrspace(1)* @gptr.8
+  ret void
+}
+
+; CHECK-LABEL: @f0()
+; CHECK: %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.6 to i32 addrspace(1)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32
+; CHECK: addrspace(1)* bitcast (i64* addrspace(1)* @gptr.6 to i32 addrspace(1)*) to i32*) to i64)) to i32*), align 4
+; CHECK: ret void
define void @f0() {
+  %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.6 to i32 addrspace(1)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.6 to i32 addrspace(1)*) to i32*) to i64)) to i32*), align 4
+  ret void
+}
+
+; CHECK-LABEL: @k0()
+; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
+; CHECK: %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.5 to i32 addrspace(1)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32
+; CHECK: addrspace(1)* bitcast (i64* addrspace(1)* @gptr.5 to i32 addrspace(1)*) to i32*) to i64)) to i32*), align 4
+; CHECK: ret void
+define amdgpu_kernel void @k0() {
+  %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.5 to i32 addrspace(1)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.5 to i32 addrspace(1)*) to i32*) to i64)) to i32*), align 4
+  ret void
+}
+
+; CHECK-LABEL: @k1()
+; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
+; CHECK: ret void
+define amdgpu_kernel void @k1() {
+  ret void
+}
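As an end-to-end sanity check of the module-mode fallback on the trickiest case above — @lds.7 never appears in an instruction, only through the @gptr.7 → @gptr.8 → @f1 chain — the same walk can be reproduced outside the pass. A hypothetical standalone driver (a sketch only, assuming an LLVM of this patch's vintage with typed pointers, linked against LLVMCore, LLVMAsmParser and LLVMSupport; names are illustrative):

```cpp
// Standalone trace of the module-test chain: @lds.7 -> @gptr.7 -> @gptr.8 ->
// load in @f1. Mirrors hasUserInstruction; not part of the patch.
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/AsmParser/Parser.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  SMDiagnostic Err;
  // Reduced form of the @lds.7 chain from the test above.
  std::unique_ptr<Module> M = parseAssemblyString(R"IR(
@lds.7 = addrspace(3) global [3 x float] undef, align 16
@gptr.7 = addrspace(1) global i64* addrspacecast ([3 x float] addrspace(3)* @lds.7 to i64*), align 8
@gptr.8 = addrspace(1) global i64** addrspacecast (i64* addrspace(1)* @gptr.7 to i64**), align 8
define void @f1() {
  %ld = load i64**, i64** addrspace(1)* @gptr.8
  ret void
}
)IR",
                                                  Err, Ctx);
  if (!M) {
    Err.print("lds-trace", errs());
    return 1;
  }

  // Same walk as hasUserInstruction: look through constant users until an
  // instruction shows up.
  auto ReachesInstruction = [](const GlobalValue *GV) {
    SmallPtrSet<const User *, 8> Visited;
    SmallVector<const User *, 16> Stack(GV->users());
    while (!Stack.empty()) {
      const User *U = Stack.pop_back_val();
      if (!Visited.insert(U).second)
        continue;
      if (isa<Instruction>(U))
        return true;
      append_range(Stack, cast<Constant>(U)->users());
    }
    return false;
  };

  // @lds.7 has no instruction user of its own, but its global user @gptr.7
  // reaches the load in @f1 via @gptr.8 -- exactly why the pass lowers it.
  auto *Gptr7 = M->getNamedValue("gptr.7");
  outs() << "gptr.7 reaches an instruction: "
         << (ReachesInstruction(Gptr7) ? "yes" : "no") << '\n'; // prints "yes"
  return 0;
}
```

Because the walk returns true for @gptr.7, the `if (!F && !Ret)` fallback in `shouldLowerLDSToStruct` sets `Ret` for @lds.7, which is why the module struct type above contains its `[3 x float]` member.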