diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -911,6 +911,7 @@ // Create a struct for each kernel for the non-module-scope variables. + IRBuilder<> Builder(M.getContext()); DenseMap KernelToReplacement; for (Function &Func : M.functions()) { if (Func.isDeclaration() || !isKernelLDS(&Func)) @@ -962,6 +963,9 @@ auto Replacement = createLDSVariableReplacement(M, VarName, KernelUsedVariables); + + // In case all uses are from called functions + markUsedByKernel(Builder, &Func, Replacement.SGV); // remove preserves existing codegen removeLocalVarsFromUsedLists(M, KernelUsedVariables); @@ -1156,8 +1160,6 @@ DenseSet Vec; Vec.insert(GV); - // TODO: Looks like a latent bug, Replacement may not be marked - // UsedByKernel here replaceLDSVariablesWithStruct(M, Vec, Replacement, [](Use &U) { return isa(U.getUser()); }); @@ -1172,11 +1174,6 @@ LLVMContext &Ctx = M.getContext(); IRBuilder<> Builder(Ctx); - for (size_t i = 0; i < OrderedKernels.size(); i++) { - markUsedByKernel(Builder, OrderedKernels[i], - KernelToReplacement[OrderedKernels[i]].SGV); - } - // The order must be consistent between lookup table and accesses to // lookup table std::vector TableLookupVariablesOrdered( diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll --- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll @@ -868,45 +868,44 @@ define amdgpu_kernel void @write2_sgemm_sequence(ptr addrspace(1) %C, i32 %lda, i32 %ldb, ptr addrspace(1) %in) #0 { ; CI-LABEL: write2_sgemm_sequence: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x4 -; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dword s0, s[0:1], 0x0 -; CI-NEXT: s_lshl_b32 s1, s2, 2 -; CI-NEXT: s_add_i32 s2, s1, 0xc20 -; CI-NEXT: s_addk_i32 s1, 0xc60 -; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_lshl_b32 s0, s2, 2 +; CI-NEXT: s_add_i32 s1, s0, 0xc20 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; CI-NEXT: s_addk_i32 s0, 0xc60 +; CI-NEXT: v_mov_b32_e32 v2, s1 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; CI-NEXT: v_mov_b32_e32 v3, s0 -; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1 -; CI-NEXT: v_mov_b32_e32 v0, s1 -; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1 -; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v1 -; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1 -; CI-NEXT: ds_write2_b32 v0, v2, v3 offset0:32 offset1:33 -; CI-NEXT: ds_write2_b32 v0, v2, v3 offset0:64 offset1:65 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: ds_write2_b32 v2, v0, v0 offset1:1 +; CI-NEXT: ds_write2_b32 v3, v0, v0 offset1:1 +; CI-NEXT: ds_write2_b32 v1, v0, v0 offset1:1 +; CI-NEXT: ds_write2_b32 v1, v0, v0 offset0:32 offset1:33 +; CI-NEXT: ds_write2_b32 v1, v0, v0 offset0:64 offset1:65 ; CI-NEXT: s_endpgm ; ; GFX9-LABEL: write2_sgemm_sequence: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x10 -; GFX9-NEXT: s_lshl_b32 s2, s2, 2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX9-NEXT: s_add_i32 s1, s2, 0xc20 -; GFX9-NEXT: s_addk_i32 s2, 0xc60 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v0, s[0:1] +; GFX9-NEXT: s_lshl_b32 s0, s2, 2 +; GFX9-NEXT: s_add_i32 s1, s0, 0xc20 +; GFX9-NEXT: s_addk_i32 s0, 0xc60 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset1:1 -; GFX9-NEXT: ds_write2_b32 v2, v3, v4 offset1:1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v1 -; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset1:1 -; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset0:32 offset1:33 -; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset0:64 offset1:65 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ds_write2_b32 v2, v0, v0 offset1:1 +; GFX9-NEXT: ds_write2_b32 v3, v0, v0 offset1:1 +; GFX9-NEXT: ds_write2_b32 v1, v0, v0 offset1:1 +; GFX9-NEXT: ds_write2_b32 v1, v0, v0 offset0:32 offset1:33 +; GFX9-NEXT: ds_write2_b32 v1, v0, v0 offset0:64 offset1:65 ; GFX9-NEXT: s_endpgm %x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1 %y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1 diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr-phi.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr-phi.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr-phi.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr-phi.ll @@ -8,6 +8,7 @@ define amdgpu_kernel void @func(i32 %c) { ; CHECK-LABEL: @func( ; CHECK-NEXT: entry: +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.func.lds) ] ; CHECK-NEXT: switch i32 [[C:%.*]], label [[RETURN:%.*]] [ ; CHECK-NEXT: i32 0, label [[BB0:%.*]] ; CHECK-NEXT: i32 1, label [[BB1:%.*]] diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-ambiguous.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-ambiguous.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-ambiguous.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-ambiguous.ll @@ -9,6 +9,7 @@ @kernel.lds = addrspace(3) global i8 undef define amdgpu_kernel void @k0() { ; CHECK-LABEL: @k0( +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds) ] ; CHECK-NEXT: [[LD:%.*]] = load i8, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, align 1 ; CHECK-NEXT: [[MUL:%.*]] = mul i8 [[LD]], 2 ; CHECK-NEXT: store i8 [[MUL]], ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, align 1 @@ -22,6 +23,7 @@ define amdgpu_kernel void @k1() { ; CHECK-LABEL: @k1( +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds) ] ; CHECK-NEXT: [[LD:%.*]] = load i8, ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds, align 1 ; CHECK-NEXT: [[MUL:%.*]] = mul i8 [[LD]], 3 ; CHECK-NEXT: store i8 [[MUL]], ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds, align 1 diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-unambiguous.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-unambiguous.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-unambiguous.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-unambiguous.ll @@ -12,6 +12,7 @@ @k0.lds = addrspace(3) global i8 undef define amdgpu_kernel void @k0() { ; CHECK-LABEL: @k0( +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds) ] ; CHECK-NEXT: [[LD:%.*]] = load i8, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, align 1 ; CHECK-NEXT: [[MUL:%.*]] = mul i8 [[LD]], 2 ; CHECK-NEXT: store i8 [[MUL]], ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, align 1 @@ -28,9 +29,9 @@ @f0.lds = addrspace(3) global i16 undef define void @f0() { ; MODULE-LABEL: @f0( -; MODULE-NEXT: [[LD:%.*]] = load i16, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 4, !alias.scope !1, !noalias !4 +; MODULE-NEXT: [[LD:%.*]] = load i16, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 4, !alias.scope [[META1:![0-9]+]], !noalias [[META4:![0-9]+]] ; MODULE-NEXT: [[MUL:%.*]] = mul i16 [[LD]], 3 -; MODULE-NEXT: store i16 [[MUL]], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 4, !alias.scope !1, !noalias !4 +; MODULE-NEXT: store i16 [[MUL]], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 4, !alias.scope [[META1]], !noalias [[META4]] ; MODULE-NEXT: ret void ; ; TABLE-LABEL: @f0( @@ -60,7 +61,7 @@ define amdgpu_kernel void @k_f0() { ; MODULE-LABEL: @k_f0( -; MODULE-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ], !alias.scope !5, !noalias !1 +; MODULE-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ], !alias.scope [[META5:![0-9]+]], !noalias [[META1]] ; MODULE-NEXT: call void @f0() ; MODULE-NEXT: ret void ; @@ -70,6 +71,7 @@ ; TABLE-NEXT: ret void ; ; K_OR_HY-LABEL: @k_f0( +; K_OR_HY-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k_f0.lds) ] ; K_OR_HY-NEXT: call void @f0() ; K_OR_HY-NEXT: ret void ; @@ -82,9 +84,9 @@ @both.lds = addrspace(3) global i32 undef define void @f_both() { ; MODULE-LABEL: @f_both( -; MODULE-NEXT: [[LD:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope !5, !noalias !4 +; MODULE-NEXT: [[LD:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope [[META5]], !noalias [[META4]] ; MODULE-NEXT: [[MUL:%.*]] = mul i32 [[LD]], 4 -; MODULE-NEXT: store i32 [[MUL]], ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope !5, !noalias !4 +; MODULE-NEXT: store i32 [[MUL]], ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope [[META5]], !noalias [[META4]] ; MODULE-NEXT: ret void ; ; TABLE-LABEL: @f_both( @@ -115,9 +117,9 @@ define amdgpu_kernel void @k0_both() { ; MODULE-LABEL: @k0_both( ; MODULE-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] -; MODULE-NEXT: [[LD:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope !5, !noalias !1 +; MODULE-NEXT: [[LD:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope [[META5]], !noalias [[META1]] ; MODULE-NEXT: [[MUL:%.*]] = mul i32 [[LD]], 5 -; MODULE-NEXT: store i32 [[MUL]], ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope !5, !noalias !1 +; MODULE-NEXT: store i32 [[MUL]], ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope [[META5]], !noalias [[META1]] ; MODULE-NEXT: call void @f_both() ; MODULE-NEXT: ret void ; @@ -130,6 +132,7 @@ ; TABLE-NEXT: ret void ; ; K_OR_HY-LABEL: @k0_both( +; K_OR_HY-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k0_both.lds) ] ; K_OR_HY-NEXT: [[LD:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.kernel.k0_both.lds, align 4 ; K_OR_HY-NEXT: [[MUL:%.*]] = mul i32 [[LD]], 5 ; K_OR_HY-NEXT: store i32 [[MUL]], ptr addrspace(3) @llvm.amdgcn.kernel.k0_both.lds, align 4