diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -911,6 +911,7 @@ // Create a struct for each kernel for the non-module-scope variables. + IRBuilder<> Builder(M.getContext()); DenseMap KernelToReplacement; for (Function &Func : M.functions()) { if (Func.isDeclaration() || !isKernelLDS(&Func)) @@ -963,6 +964,12 @@ auto Replacement = createLDSVariableReplacement(M, VarName, KernelUsedVariables); + // If any indirect uses, create a direct use to ensure allocation + // TODO: Simpler to unconditionally mark used but that regresses + // codegen in noclobber-barrier + if (!LDSUsesInfo.indirect_access[&Func].empty()) + markUsedByKernel(Builder, &Func, Replacement.SGV); + // remove preserves existing codegen removeLocalVarsFromUsedLists(M, KernelUsedVariables); KernelToReplacement[&Func] = Replacement; @@ -1156,8 +1163,6 @@ DenseSet Vec; Vec.insert(GV); - // TODO: Looks like a latent bug, Replacement may not be marked - // UsedByKernel here replaceLDSVariablesWithStruct(M, Vec, Replacement, [](Use &U) { return isa(U.getUser()); }); @@ -1172,11 +1177,6 @@ LLVMContext &Ctx = M.getContext(); IRBuilder<> Builder(Ctx); - for (size_t i = 0; i < OrderedKernels.size(); i++) { - markUsedByKernel(Builder, OrderedKernels[i], - KernelToReplacement[OrderedKernels[i]].SGV); - } - // The order must be consistent between lookup table and accesses to // lookup table std::vector TableLookupVariablesOrdered( diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll @@ -17,6 +17,7 @@ ; GCN: ds_write_b8 [[NULL]], [[TWO]] offset:16 define amdgpu_kernel void @k0() { ; OPT-LABEL: @k0( +; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds) ] ; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] ; OPT-NEXT: store i8 1, ptr addrspace(3) @llvm.amdgcn.module.lds, align 1 ; OPT-NEXT: store i8 2, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, align 16 diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-unambiguous.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-unambiguous.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-unambiguous.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-unambiguous.ll @@ -28,9 +28,9 @@ @f0.lds = addrspace(3) global i16 undef define void @f0() { ; MODULE-LABEL: @f0( -; MODULE-NEXT: [[LD:%.*]] = load i16, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 4, !alias.scope !1, !noalias !4 +; MODULE-NEXT: [[LD:%.*]] = load i16, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 4, !alias.scope [[META1:![0-9]+]], !noalias [[META4:![0-9]+]] ; MODULE-NEXT: [[MUL:%.*]] = mul i16 [[LD]], 3 -; MODULE-NEXT: store i16 [[MUL]], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 4, !alias.scope !1, !noalias !4 +; MODULE-NEXT: store i16 [[MUL]], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 4, !alias.scope [[META1]], !noalias [[META4]] ; MODULE-NEXT: ret void ; ; TABLE-LABEL: @f0( @@ -60,7 +60,7 @@ define amdgpu_kernel void @k_f0() { ; MODULE-LABEL: @k_f0( -; MODULE-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ], !alias.scope !5, !noalias !1 +; MODULE-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ], !alias.scope [[META5:![0-9]+]], !noalias [[META1]] ; MODULE-NEXT: call void @f0() ; MODULE-NEXT: ret void ; @@ -70,6 +70,7 @@ ; TABLE-NEXT: ret void ; ; K_OR_HY-LABEL: @k_f0( +; K_OR_HY-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k_f0.lds) ] ; K_OR_HY-NEXT: call void @f0() ; K_OR_HY-NEXT: ret void ; @@ -82,9 +83,9 @@ @both.lds = addrspace(3) global i32 undef define void @f_both() { ; MODULE-LABEL: @f_both( -; MODULE-NEXT: [[LD:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope !5, !noalias !4 +; MODULE-NEXT: [[LD:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope [[META5]], !noalias [[META4]] ; MODULE-NEXT: [[MUL:%.*]] = mul i32 [[LD]], 4 -; MODULE-NEXT: store i32 [[MUL]], ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope !5, !noalias !4 +; MODULE-NEXT: store i32 [[MUL]], ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope [[META5]], !noalias [[META4]] ; MODULE-NEXT: ret void ; ; TABLE-LABEL: @f_both( @@ -115,9 +116,9 @@ define amdgpu_kernel void @k0_both() { ; MODULE-LABEL: @k0_both( ; MODULE-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] -; MODULE-NEXT: [[LD:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope !5, !noalias !1 +; MODULE-NEXT: [[LD:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope [[META5]], !noalias [[META1]] ; MODULE-NEXT: [[MUL:%.*]] = mul i32 [[LD]], 5 -; MODULE-NEXT: store i32 [[MUL]], ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope !5, !noalias !1 +; MODULE-NEXT: store i32 [[MUL]], ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope [[META5]], !noalias [[META1]] ; MODULE-NEXT: call void @f_both() ; MODULE-NEXT: ret void ; @@ -130,6 +131,7 @@ ; TABLE-NEXT: ret void ; ; K_OR_HY-LABEL: @k0_both( +; K_OR_HY-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k0_both.lds) ] ; K_OR_HY-NEXT: [[LD:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.kernel.k0_both.lds, align 4 ; K_OR_HY-NEXT: [[MUL:%.*]] = mul i32 [[LD]], 5 ; K_OR_HY-NEXT: store i32 [[MUL]], ptr addrspace(3) @llvm.amdgcn.kernel.k0_both.lds, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll @@ -19,12 +19,11 @@ ; OPT: @llvm.amdgcn.kernel.k123.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k123.lds.t undef, align 8, !absolute_symbol !2 ; OPT{LITERAL}: @llvm.amdgcn.lds.offset.table = internal addrspace(4) constant [2 x [1 x i32]] [[1 x i32] [i32 ptrtoint (ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds to i32)], [1 x i32] [i32 ptrtoint (ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds to i32)]] -;. define void @f0() { ; OPT-LABEL: @f0( -; OPT-NEXT: %ld = load float, ptr addrspace(3) @llvm.amdgcn.kernel.k01.lds, align 4 -; OPT-NEXT: %mul = fmul float %ld, 2.000000e+00 -; OPT-NEXT: store float %mul, ptr addrspace(3) @llvm.amdgcn.kernel.k01.lds, align 4 +; OPT-NEXT: [[LD:%.*]] = load float, ptr addrspace(3) @llvm.amdgcn.kernel.k01.lds, align 4 +; OPT-NEXT: [[MUL:%.*]] = fmul float [[LD]], 2.000000e+00 +; OPT-NEXT: store float [[MUL]], ptr addrspace(3) @llvm.amdgcn.kernel.k01.lds, align 4 ; OPT-NEXT: ret void ; ; GCN-LABEL: f0: @@ -46,9 +45,9 @@ define void @f1() { ; OPT-LABEL: @f1( -; OPT-NEXT: %ld = load i16, ptr addrspace(3) @llvm.amdgcn.module.lds, align 16 -; OPT-NEXT: %mul = mul i16 %ld, 3 -; OPT-NEXT: store i16 %mul, ptr addrspace(3) @llvm.amdgcn.module.lds, align 16 +; OPT-NEXT: [[LD:%.*]] = load i16, ptr addrspace(3) @llvm.amdgcn.module.lds, align 16 +; OPT-NEXT: [[MUL:%.*]] = mul i16 [[LD]], 3 +; OPT-NEXT: store i16 [[MUL]], ptr addrspace(3) @llvm.amdgcn.module.lds, align 16 ; OPT-NEXT: ret void ; ; GCN-LABEL: f1: @@ -70,16 +69,16 @@ define void @f2() { ; OPT-LABEL: @f2( -; OPT-NEXT: %1 = call i32 @llvm.amdgcn.lds.kernel.id() -; OPT-NEXT: %v22 = getelementptr inbounds [2 x [1 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 %1, i32 0 -; OPT-NEXT: %2 = load i32, ptr addrspace(4) %v22, align 4 -; OPT-NEXT: %v23 = inttoptr i32 %2 to ptr addrspace(3) -; OPT-NEXT: %ld = load i64, ptr addrspace(3) %v23, align 4 -; OPT-NEXT: %mul = mul i64 %ld, 4 -; OPT-NEXT: %v2 = getelementptr inbounds [2 x [1 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 %1, i32 0 -; OPT-NEXT: %3 = load i32, ptr addrspace(4) %v2, align 4 -; OPT-NEXT: %v21 = inttoptr i32 %3 to ptr addrspace(3) -; OPT-NEXT: store i64 %mul, ptr addrspace(3) %v21, align 4 +; OPT-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id() +; OPT-NEXT: [[V22:%.*]] = getelementptr inbounds [2 x [1 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0 +; OPT-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[V22]], align 4 +; OPT-NEXT: [[V23:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3) +; OPT-NEXT: [[LD:%.*]] = load i64, ptr addrspace(3) [[V23]], align 4 +; OPT-NEXT: [[MUL:%.*]] = mul i64 [[LD]], 4 +; OPT-NEXT: [[V2:%.*]] = getelementptr inbounds [2 x [1 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0 +; OPT-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[V2]], align 4 +; OPT-NEXT: [[V21:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(3) +; OPT-NEXT: store i64 [[MUL]], ptr addrspace(3) [[V21]], align 4 ; OPT-NEXT: ret void ; ; GCN-LABEL: f2: @@ -111,9 +110,9 @@ define void @f3() { ; OPT-LABEL: @f3( -; OPT-NEXT: %ld = load i8, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k23.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds, i32 0, i32 1), align 8 -; OPT-NEXT: %mul = mul i8 %ld, 5 -; OPT-NEXT: store i8 %mul, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k23.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds, i32 0, i32 1), align 8 +; OPT-NEXT: [[LD:%.*]] = load i8, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K23_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds, i32 0, i32 1), align 8 +; OPT-NEXT: [[MUL:%.*]] = mul i8 [[LD]], 5 +; OPT-NEXT: store i8 [[MUL]], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K23_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds, i32 0, i32 1), align 8 ; OPT-NEXT: ret void ; ; GCN-LABEL: f3: @@ -136,9 +135,9 @@ ; Doesn't access any via a function, won't be in the lookup table define amdgpu_kernel void @kernel_no_table() { ; OPT-LABEL: @kernel_no_table( -; OPT-NEXT: %ld = load i64, ptr addrspace(3) @llvm.amdgcn.kernel.kernel_no_table.lds, align 8 -; OPT-NEXT: %mul = mul i64 %ld, 8 -; OPT-NEXT: store i64 %mul, ptr addrspace(3) @llvm.amdgcn.kernel.kernel_no_table.lds, align 8 +; OPT-NEXT: [[LD:%.*]] = load i64, ptr addrspace(3) @llvm.amdgcn.kernel.kernel_no_table.lds, align 8 +; OPT-NEXT: [[MUL:%.*]] = mul i64 [[LD]], 8 +; OPT-NEXT: store i64 [[MUL]], ptr addrspace(3) @llvm.amdgcn.kernel.kernel_no_table.lds, align 8 ; OPT-NEXT: ret void ; ; GCN-LABEL: kernel_no_table: @@ -159,6 +158,7 @@ ; Access two variables, will allocate those two define amdgpu_kernel void @k01() { ; OPT-LABEL: @k01( +; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k01.lds) ] ; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] ; OPT-NEXT: call void @f0() ; OPT-NEXT: call void @f1() @@ -193,7 +193,7 @@ define amdgpu_kernel void @k23() { ; OPT-LABEL: @k23( -; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds) ] +; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds) ], !alias.scope [[META4:![0-9]+]], !noalias [[META7:![0-9]+]] ; OPT-NEXT: call void @f2() ; OPT-NEXT: call void @f3() ; OPT-NEXT: ret void @@ -231,12 +231,12 @@ ; Access and allocate three variables define amdgpu_kernel void @k123() { ; OPT-LABEL: @k123( -; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds) ] +; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds) ], !alias.scope [[META10:![0-9]+]], !noalias [[META13:![0-9]+]] ; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] ; OPT-NEXT: call void @f1() -; OPT-NEXT: %ld = load i8, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k123.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 8, !alias.scope !5, !noalias !8 -; OPT-NEXT: %mul = mul i8 %ld, 8 -; OPT-NEXT: store i8 %mul, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k123.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 8, !alias.scope !5, !noalias !8 +; OPT-NEXT: [[LD:%.*]] = load i8, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 8, !alias.scope [[META13]], !noalias [[META10]] +; OPT-NEXT: [[MUL:%.*]] = mul i8 [[LD]], 8 +; OPT-NEXT: store i8 [[MUL]], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 8, !alias.scope [[META13]], !noalias [[META10]] ; OPT-NEXT: call void @f2() ; OPT-NEXT: ret void ; @@ -284,22 +284,25 @@ !2 = !{i32 1} -;. ; OPT: attributes #0 = { "amdgpu-elide-module-lds" } ; OPT: attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) } ; OPT: attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -;. + ; OPT: !0 = !{i64 0, i64 1} ; OPT: !1 = !{i64 4, i64 5} ; OPT: !2 = !{i64 8, i64 9} ; OPT: !3 = !{i32 1} -; OPT: !4 = !{i32 0} -; OPT: !5 = !{!6} -; OPT: !6 = distinct !{!6, !7} -; OPT: !7 = distinct !{!7} -; OPT: !8 = !{!9} -; OPT: !9 = distinct !{!9, !7} -;. +; OPT: !4 = !{!5} +; OPT: !5 = distinct !{!5, !6} +; OPT: !6 = distinct !{!6} +; OPT: !7 = !{!8} +; OPT: !8 = distinct !{!8, !6} +; OPT: !9 = !{i32 0} +; OPT: !10 = !{!11} +; OPT: !11 = distinct !{!11, !12} +; OPT: !12 = distinct !{!12} +; OPT: !13 = !{!14} +; OPT: !14 = distinct !{!14, !12} ; Table size length number-kernels * number-variables * sizeof(uint16_t) ; GCN: .type llvm.amdgcn.lds.offset.table,@object diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll @@ -256,7 +256,7 @@ } define amdgpu_kernel void @k23() { -; OPT-LABEL: @k23() !llvm.amdgcn.lds.kernel.id !2 { +; OPT-LABEL: @k23() !llvm.amdgcn.lds.kernel.id !7 { ; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds) ] ; OPT-NEXT: call void @f2() ; OPT-NEXT: call void @f3() @@ -295,12 +295,12 @@ ; Access and allocate three variables define amdgpu_kernel void @k123() { -; OPT-LABEL: @k123() !llvm.amdgcn.lds.kernel.id !3 { +; OPT-LABEL: @k123() !llvm.amdgcn.lds.kernel.id !13 { ; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds) ] ; OPT-NEXT: call void @f1() -; OPT-NEXT: [[LD:%.*]] = load i8, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 2, !alias.scope !4, !noalias !7 +; OPT-NEXT: [[LD:%.*]] = load i8, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 2, !alias.scope !20, !noalias !21 ; OPT-NEXT: [[MUL:%.*]] = mul i8 [[LD]], 8 -; OPT-NEXT: store i8 [[MUL]], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 2, !alias.scope !4, !noalias !7 +; OPT-NEXT: store i8 [[MUL]], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 2, !alias.scope !20, !noalias !21 ; OPT-NEXT: call void @f2() ; OPT-NEXT: ret void ;