diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -292,8 +292,18 @@
     Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)};
     Constant *GEP = ConstantExpr::getGetElementPtr(LDSTy, SGV, GEPIdx);
     if (F) {
+      // Replace all constant uses with instructions if they belong to the
+      // current kernel.
+      for (User *U : make_early_inc_range(GV->users())) {
+        if (ConstantExpr *C = dyn_cast<ConstantExpr>(U))
+          AMDGPU::replaceConstantUsesInFunction(C, F);
+      }
+
+      GV->removeDeadConstantUsers();
+
       GV->replaceUsesWithIf(GEP, [F](Use &U) {
-        return AMDGPU::isUsedOnlyFromFunction(U.getUser(), F);
+        Instruction *I = dyn_cast<Instruction>(U.getUser());
+        return I && I->getFunction() == F;
       });
     } else {
       GV->replaceAllUsesWith(GEP);
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
@@ -17,6 +17,8 @@
 
 namespace llvm {
 
+class ConstantExpr;
+
 namespace AMDGPU {
 
 bool isKernelCC(const Function *Func);
@@ -39,9 +41,8 @@
 
 SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M);
 
-/// \returns true if all uses of \p U end up in a function \p F.
-bool isUsedOnlyFromFunction(const User *U, const Function *F);
-
+/// Replace all uses of constant \p C with instructions in \p F.
+void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F);
 } // end namespace AMDGPU
 } // end namespace llvm
 
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
@@ -12,7 +12,9 @@
 
 #include "AMDGPULDSUtils.h"
 #include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/ReplaceConstant.h"
 
 using namespace llvm;
 
@@ -29,17 +31,33 @@
                                        GV->getValueType());
 }
 
-bool isUsedOnlyFromFunction(const User *U, const Function *F) {
-  if (auto *I = dyn_cast<Instruction>(U)) {
-    return I->getFunction() == F;
-  }
+static void collectFunctionUses(User *U, const Function *F,
+                                SetVector<Instruction *> &InstUsers) {
+  SmallVector<User *> Stack{U};
+
+  while (!Stack.empty()) {
+    U = Stack.pop_back_val();
+
+    if (auto *I = dyn_cast<Instruction>(U)) {
+      if (I->getFunction() == F)
+        InstUsers.insert(I);
+      continue;
+    }
 
-  if (isa<ConstantExpr>(U)) {
-    return all_of(U->users(),
-                  [F](const User *U) { return isUsedOnlyFromFunction(U, F); });
+    if (!isa<ConstantExpr>(U))
+      continue;
+
+    append_range(Stack, U->users());
   }
+}
 
-  return false;
+void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F) {
+  SetVector<Instruction *> InstUsers;
+
+  collectFunctionUses(C, F, InstUsers);
+  for (Instruction *I : InstUsers) {
+    convertConstantExprsToInstructions(I, C);
+  }
 }
 
 bool shouldLowerLDSToStruct(const SmallPtrSetImpl<GlobalValue *> &UsedList,
@@ -76,14 +94,6 @@
     }
 
     if (auto *E = dyn_cast<ConstantExpr>(V)) {
-      if (F) {
-        // Any use which does not end up an instruction disqualifies a
-        // variable to be put into a kernel's LDS structure because later
-        // we will need to replace only this kernel's uses for which we
-        // need to identify a using function.
-      if (!isUsedOnlyFromFunction(E, F))
-        return false;
-    }
       for (const User *U : E->users()) {
         if (Visited.insert(U).second) {
           Stack.push_back(U);
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
--- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -947,7 +947,7 @@
 ; CI:       ; %bb.0:
 ; CI-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-NEXT:    s_mov_b32 m0, -1
-; CI-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
+; CI-NEXT:    ds_read_b64 v[0:1], v0
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 ; CI-NEXT:    s_mov_b32 s2, -1
@@ -959,7 +959,7 @@
 ; GFX9-LABEL: load_constant_adjacent_offsets:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
+; GFX9-NEXT:    ds_read_b64 v[0:1], v2
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
--- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
@@ -774,21 +774,19 @@
 define amdgpu_kernel void @store_constant_adjacent_offsets() {
 ; CI-LABEL: store_constant_adjacent_offsets:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_movk_i32 s0, 0x7b
-; CI-NEXT:    v_mov_b32_e32 v0, 0
-; CI-NEXT:    v_mov_b32_e32 v1, s0
-; CI-NEXT:    v_mov_b32_e32 v2, s0
+; CI-NEXT:    v_mov_b32_e32 v0, 0x7b
+; CI-NEXT:    v_mov_b32_e32 v1, v0
+; CI-NEXT:    v_mov_b32_e32 v2, 0
 ; CI-NEXT:    s_mov_b32 m0, -1
-; CI-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
+; CI-NEXT:    ds_write_b64 v2, v[0:1]
 ; CI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: store_constant_adjacent_offsets:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_movk_i32 s0, 0x7b
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    ds_write_b64 v2, v[0:1]
 ; GFX9-NEXT:    s_endpgm
   store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
   store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll
@@ -1,19 +1,28 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --check-globals
 ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
 ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
 
-; CHECK: %llvm.amdgcn.kernel.k2.lds.t = type { i32 }
-; CHECK-NOT: %llvm.amdgcn.kernel.k4.lds.t
-
 @lds.1 = internal unnamed_addr addrspace(3) global [2 x i8] undef, align 1
 
+; CHECK: %llvm.amdgcn.kernel.k0.lds.t = type { [2 x i8] }
+; CHECK: %llvm.amdgcn.kernel.k1.lds.t = type { [2 x i8] }
+; CHECK: %llvm.amdgcn.kernel.k2.lds.t = type { i32 }
+; CHECK: %llvm.amdgcn.kernel.k3.lds.t = type { [32 x i8] }
+; CHECK: %llvm.amdgcn.kernel.k4.lds.t = type { [2 x i8] }
+
 ; Use constant from different kernels
 ;.
-; CHECK: @lds.1 = internal unnamed_addr addrspace(3) global [2 x i8] undef, align 1
+; CHECK: @llvm.amdgcn.kernel.k0.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k0.lds.t undef, align 2
+; CHECK: @llvm.amdgcn.kernel.k1.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k1.lds.t undef, align 2
 ; CHECK: @llvm.amdgcn.kernel.k2.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k2.lds.t undef, align 4
+; CHECK: @llvm.amdgcn.kernel.k3.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k3.lds.t undef, align 16
+; CHECK: @llvm.amdgcn.kernel.k4.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k4.lds.t undef, align 2
 ;.
 define amdgpu_kernel void @k0(i64 %x) {
 ; CHECK-LABEL: @k0(
-; CHECK-NEXT: %ptr = getelementptr inbounds i8, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(3)* @lds.1, i32 0, i32 0) to i8*), i64 %x
+; CHECK-NEXT: %1 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0), i32 0, i32 0
+; CHECK-NEXT: %2 = addrspacecast i8 addrspace(3)* %1 to i8*
+; CHECK-NEXT: %ptr = getelementptr inbounds i8, i8* %2, i64 %x
 ; CHECK-NEXT: store i8 1, i8* %ptr, align 1
 ; CHECK-NEXT: ret void
 ;
@@ -24,7 +33,9 @@
 
 define amdgpu_kernel void @k1(i64 %x) {
 ; CHECK-LABEL: @k1(
-; CHECK-NEXT: %ptr = getelementptr inbounds i8, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(3)* @lds.1, i32 0, i32 0) to i8*), i64 %x
+; CHECK-NEXT: %1 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 0), i32 0, i32 0
+; CHECK-NEXT: %2 = addrspacecast i8 addrspace(3)* %1 to i8*
+; CHECK-NEXT: %ptr = getelementptr inbounds i8, i8* %2, i64 %x
 ; CHECK-NEXT: store i8 1, i8* %ptr, align 1
 ; CHECK-NEXT: ret void
 ;
@@ -56,10 +67,15 @@
 ; Use constant twice from the same kernel but a different other constant.
 define amdgpu_kernel void @k3(i64 %x) {
 ; CHECK-LABEL: @k3(
-; CHECK-NEXT: %ptr1 = addrspacecast i64 addrspace(3)* bitcast (i8 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k3.lds.t, %llvm.amdgcn.kernel.k3.lds.t addrspace(3)* @llvm.amdgcn.kernel.k3.lds, i32 0, i32 0, i32 16) to i64 addrspace(3)*) to i64*
+; CHECK-NEXT: %1 = getelementptr inbounds [32 x i8], [32 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k3.lds.t, %llvm.amdgcn.kernel.k3.lds.t addrspace(3)* @llvm.amdgcn.kernel.k3.lds, i32 0, i32 0), i32 0, i32 16
+; CHECK-NEXT: %2 = bitcast i8 addrspace(3)* %1 to i64 addrspace(3)*
+; CHECK-NEXT: %ptr1 = addrspacecast i64 addrspace(3)* %2 to i64*
 ; CHECK-NEXT: store i64 1, i64* %ptr1, align 1
-; CHECK-NEXT: %ptr2 = addrspacecast i64 addrspace(3)* bitcast (i8 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k3.lds.t, %llvm.amdgcn.kernel.k3.lds.t addrspace(3)* @llvm.amdgcn.kernel.k3.lds, i32 0, i32 0, i32 24) to i64 addrspace(3)*) to i64*
+; CHECK-NEXT: %3 = getelementptr inbounds [32 x i8], [32 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k3.lds.t, %llvm.amdgcn.kernel.k3.lds.t addrspace(3)* @llvm.amdgcn.kernel.k3.lds, i32 0, i32 0), i32 0, i32 24
+; CHECK-NEXT: %4 = bitcast i8 addrspace(3)* %3 to i64 addrspace(3)*
+; CHECK-NEXT: %ptr2 = addrspacecast i64 addrspace(3)* %4 to i64*
 ; CHECK-NEXT: store i64 2, i64* %ptr2, align 1
+; CHECK-NEXT: ret void
 ;
   %ptr1 = addrspacecast i64 addrspace(3)* bitcast (i8 addrspace(3)* getelementptr inbounds ([32 x i8], [32 x i8] addrspace(3)* @lds.3, i32 0, i32 16) to i64 addrspace(3)*) to i64*
   store i64 1, i64* %ptr1, align 1
@@ -69,10 +85,11 @@
 }
 
 ; @lds.1 is used from constant expressions in different kernels.
-; Make sure we do not create a structure for it as we cannot handle it yet.
 define amdgpu_kernel void @k4(i64 %x) {
 ; CHECK-LABEL: @k4(
-; CHECK-NEXT: %ptr = getelementptr inbounds i8, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(3)* @lds.1, i32 0, i32 0) to i8*), i64 %x
+; CHECK-NEXT: %1 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k4.lds.t, %llvm.amdgcn.kernel.k4.lds.t addrspace(3)* @llvm.amdgcn.kernel.k4.lds, i32 0, i32 0), i32 0, i32 0
+; CHECK-NEXT: %2 = addrspacecast i8 addrspace(3)* %1 to i8*
+; CHECK-NEXT: %ptr = getelementptr inbounds i8, i8* %2, i64 %x
 ; CHECK-NEXT: store i8 1, i8* %ptr, align 1
 ; CHECK-NEXT: ret void
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
@@ -37,9 +37,19 @@
 
 ; CHECK-LABEL: @timestwo()
 ; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
-; CHECK: %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.kernel.timestwo.lds.t addrspace(3)* @llvm.amdgcn.kernel.timestwo.lds to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
+; CHECK: %1 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.timestwo.lds.t, %llvm.amdgcn.kernel.timestwo.lds.t addrspace(3)* @llvm.amdgcn.kernel.timestwo.lds, i32 0, i32 0) to i32 addrspace(3)*
+; CHECK: %2 = addrspacecast i32 addrspace(3)* %1 to i32*
+; CHECK: %3 = ptrtoint i32* %2 to i64
+; CHECK: %4 = add i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64), %3
+; CHECK: %5 = inttoptr i64 %4 to i32*
+; CHECK: %ld = load i32, i32* %5, align 4
 ; CHECK: %mul = mul i32 %ld, 2
-; CHECK: store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.kernel.timestwo.lds.t addrspace(3)* @llvm.amdgcn.kernel.timestwo.lds to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
+; CHECK: %6 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.timestwo.lds.t, %llvm.amdgcn.kernel.timestwo.lds.t addrspace(3)* @llvm.amdgcn.kernel.timestwo.lds, i32 0, i32 0) to i32 addrspace(3)*
+; CHECK: %7 = addrspacecast i32 addrspace(3)* %6 to i32*
+; CHECK: %8 = ptrtoint i32* %7 to i64
+; CHECK: %9 = add i64 %8, ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64)
+; CHECK: %10 = inttoptr i64 %9 to i32*
+; CHECK: store i32 %mul, i32* %10, align 4
 define amdgpu_kernel void @timestwo() {
   %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
   %mul = mul i32 %ld, 2