diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -911,6 +911,7 @@ // Create a struct for each kernel for the non-module-scope variables. + IRBuilder<> Builder(M.getContext()); DenseMap KernelToReplacement; for (Function &Func : M.functions()) { if (Func.isDeclaration() || !isKernelLDS(&Func)) @@ -963,6 +964,9 @@ auto Replacement = createLDSVariableReplacement(M, VarName, KernelUsedVariables); + // In case all uses are from called functions + markUsedByKernel(Builder, &Func, Replacement.SGV); + // remove preserves existing codegen removeLocalVarsFromUsedLists(M, KernelUsedVariables); KernelToReplacement[&Func] = Replacement; @@ -1156,8 +1160,6 @@ DenseSet Vec; Vec.insert(GV); - // TODO: Looks like a latent bug, Replacement may not be marked - // UsedByKernel here replaceLDSVariablesWithStruct(M, Vec, Replacement, [](Use &U) { return isa(U.getUser()); }); @@ -1172,11 +1174,6 @@ LLVMContext &Ctx = M.getContext(); IRBuilder<> Builder(Ctx); - for (size_t i = 0; i < OrderedKernels.size(); i++) { - markUsedByKernel(Builder, OrderedKernels[i], - KernelToReplacement[OrderedKernels[i]].SGV); - } - // The order must be consistent between lookup table and accesses to // lookup table std::vector TableLookupVariablesOrdered( diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll --- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll @@ -868,45 +868,44 @@ define amdgpu_kernel void @write2_sgemm_sequence(ptr addrspace(1) %C, i32 %lda, i32 %ldb, ptr addrspace(1) %in) #0 { ; CI-LABEL: write2_sgemm_sequence: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x4 -; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dword s0, s[0:1], 0x0 -; CI-NEXT: s_lshl_b32 s1, s2, 2 -; CI-NEXT: s_add_i32 s2, s1, 0xc20 -; CI-NEXT: s_addk_i32 s1, 0xc60 -; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_lshl_b32 s0, s2, 2 +; CI-NEXT: s_add_i32 s1, s0, 0xc20 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; CI-NEXT: s_addk_i32 s0, 0xc60 +; CI-NEXT: v_mov_b32_e32 v2, s1 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; CI-NEXT: v_mov_b32_e32 v3, s0 -; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1 -; CI-NEXT: v_mov_b32_e32 v0, s1 -; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1 -; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v1 -; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1 -; CI-NEXT: ds_write2_b32 v0, v2, v3 offset0:32 offset1:33 -; CI-NEXT: ds_write2_b32 v0, v2, v3 offset0:64 offset1:65 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: ds_write2_b32 v2, v0, v0 offset1:1 +; CI-NEXT: ds_write2_b32 v3, v0, v0 offset1:1 +; CI-NEXT: ds_write2_b32 v1, v0, v0 offset1:1 +; CI-NEXT: ds_write2_b32 v1, v0, v0 offset0:32 offset1:33 +; CI-NEXT: ds_write2_b32 v1, v0, v0 offset0:64 offset1:65 ; CI-NEXT: s_endpgm ; ; GFX9-LABEL: write2_sgemm_sequence: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x10 -; GFX9-NEXT: s_lshl_b32 s2, s2, 2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX9-NEXT: s_add_i32 s1, s2, 0xc20 -; GFX9-NEXT: s_addk_i32 s2, 0xc60 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v0, s[0:1] +; GFX9-NEXT: s_lshl_b32 s0, s2, 2 +; GFX9-NEXT: s_add_i32 s1, s0, 0xc20 +; GFX9-NEXT: s_addk_i32 s0, 0xc60 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset1:1 -; GFX9-NEXT: ds_write2_b32 v2, v3, v4 offset1:1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v1 -; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset1:1 -; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset0:32 offset1:33 -; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset0:64 offset1:65 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ds_write2_b32 v2, v0, v0 offset1:1 +; GFX9-NEXT: ds_write2_b32 v3, v0, v0 offset1:1 +; GFX9-NEXT: ds_write2_b32 v1, v0, v0 offset1:1 +; GFX9-NEXT: ds_write2_b32 v1, v0, v0 offset0:32 offset1:33 +; GFX9-NEXT: ds_write2_b32 v1, v0, v0 offset0:64 offset1:65 ; GFX9-NEXT: s_endpgm %x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1 %y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1 diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll --- a/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll @@ -23,10 +23,11 @@ ;. define amdgpu_kernel void @k0() #0 { ; CHECK-LABEL: @k0( -; CHECK-NEXT: store i8 1, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, i32 0, i32 3), align 2, !alias.scope !1, !noalias !4 -; CHECK-NEXT: store i8 2, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, i32 0, i32 2), align 4, !alias.scope !8, !noalias !9 -; CHECK-NEXT: store i8 4, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, i32 0, i32 1), align 16, !alias.scope !10, !noalias !11 -; CHECK-NEXT: store i8 16, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, align 16, !alias.scope !12, !noalias !13 +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds) ], !alias.scope !1, !noalias !4 +; CHECK-NEXT: store i8 1, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, i32 0, i32 3), align 2, !alias.scope !8, !noalias !9 +; CHECK-NEXT: store i8 2, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, i32 0, i32 2), align 4, !alias.scope !10, !noalias !11 +; CHECK-NEXT: store i8 4, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, i32 0, i32 1), align 16, !alias.scope !12, !noalias !13 +; CHECK-NEXT: store i8 16, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, align 16, !alias.scope !1, !noalias !4 ; CHECK-NEXT: ret void store i8 1, ptr addrspace(3) @lds.size.1.align.1, align 1 @@ -41,9 +42,10 @@ define amdgpu_kernel void @k1() #0 { ; CHECK-LABEL: @k1( -; CHECK-NEXT: store i8 2, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds, i32 0, i32 2), align 4, !alias.scope !14, !noalias !17 -; CHECK-NEXT: store i8 4, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds, i32 0, i32 1), align 16, !alias.scope !20, !noalias !21 -; CHECK-NEXT: store i8 16, ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds, align 16, !alias.scope !22, !noalias !23 +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds) ], !alias.scope !14, !noalias !17 +; CHECK-NEXT: store i8 2, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds, i32 0, i32 2), align 4, !alias.scope !20, !noalias !21 +; CHECK-NEXT: store i8 4, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds, i32 0, i32 1), align 16, !alias.scope !22, !noalias !23 +; CHECK-NEXT: store i8 16, ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds, align 16, !alias.scope !14, !noalias !17 ; CHECK-NEXT: ret void ; store i8 2, ptr addrspace(3) @lds.size.2.align.2, align 2 @@ -57,8 +59,9 @@ define amdgpu_kernel void @k2() #0 { ; CHECK-LABEL: @k2( -; CHECK-NEXT: store i8 2, ptr addrspace(3) @llvm.amdgcn.kernel.k2.lds, align 2 -; CHECK-NEXT: ret void +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k2.lds) ] +; CHECK-NEXT: store i8 2, ptr addrspace(3) @llvm.amdgcn.kernel.k2.lds, align 2 +; CHECK-NEXT: ret void ; store i8 2, ptr addrspace(3) @lds.size.2.align.2, align 2 @@ -67,8 +70,9 @@ define amdgpu_kernel void @k3() #0 { ; CHECK-LABEL: @k3( -; CHECK-NEXT: store i8 4, ptr addrspace(3) @llvm.amdgcn.kernel.k3.lds, align 4 -; CHECK-NEXT: ret void +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k3.lds) ] +; CHECK-NEXT: store i8 4, ptr addrspace(3) @llvm.amdgcn.kernel.k3.lds, align 4 +; CHECK-NEXT: ret void ; store i8 4, ptr addrspace(3) @lds.size.4.align.4, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll --- a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll @@ -13,17 +13,17 @@ ; CHECK: %llvm.amdgcn.kernel.k6.lds.t = type { [4 x i32] } ; Use constant from different kernels -;. -; CHECK: @llvm.amdgcn.kernel.k0.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k0.lds.t undef, align 2 -; CHECK: @llvm.amdgcn.kernel.k1.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k1.lds.t undef, align 2 -; CHECK: @llvm.amdgcn.kernel.k2.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k2.lds.t undef, align 4 -; CHECK: @llvm.amdgcn.kernel.k3.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k3.lds.t undef, align 16 -; CHECK: @llvm.amdgcn.kernel.k4.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k4.lds.t undef, align 2 -; CHECK: @llvm.amdgcn.kernel.k5.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k5.lds.t undef, align 16 -; CHECK: @llvm.amdgcn.kernel.k6.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k6.lds.t undef, align 16 +; CHECK: @llvm.amdgcn.kernel.k0.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k0.lds.t undef, align 2, !absolute_symbol !0 +; CHECK: @llvm.amdgcn.kernel.k1.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k1.lds.t undef, align 2, !absolute_symbol !0 +; CHECK: @llvm.amdgcn.kernel.k2.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k2.lds.t undef, align 4, !absolute_symbol !0 +; CHECK: @llvm.amdgcn.kernel.k3.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k3.lds.t undef, align 16, !absolute_symbol !0 +; CHECK: @llvm.amdgcn.kernel.k4.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k4.lds.t undef, align 2, !absolute_symbol !0 +; CHECK: @llvm.amdgcn.kernel.k5.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k5.lds.t undef, align 16, !absolute_symbol !0 +; CHECK: @llvm.amdgcn.kernel.k6.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k6.lds.t undef, align 16, !absolute_symbol !0 ;. define amdgpu_kernel void @k0(i64 %x) { ; CHECK-LABEL: @k0( +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds) ] ; CHECK-NEXT: %1 = addrspacecast ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds to ptr ; CHECK-NEXT: %ptr = getelementptr inbounds i8, ptr %1, i64 %x ; CHECK-NEXT: store i8 1, ptr %ptr, align 1 @@ -36,6 +36,7 @@ define amdgpu_kernel void @k1(i64 %x) { ; CHECK-LABEL: @k1( +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds) ] ; CHECK-NEXT: %1 = addrspacecast ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds to ptr ; CHECK-NEXT: %ptr = getelementptr inbounds i8, ptr %1, i64 %x ; CHECK-NEXT: store i8 1, ptr %ptr, align 1 @@ -51,6 +52,7 @@ ; Use constant twice from the same kernel define amdgpu_kernel void @k2(i64 %x) { ; CHECK-LABEL: @k2( +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k2.lds) ] ; CHECK-NEXT: store i8 1, ptr addrspace(3) @llvm.amdgcn.kernel.k2.lds, align 4 ; CHECK-NEXT: store i8 2, ptr addrspace(3) @llvm.amdgcn.kernel.k2.lds, align 4 ; CHECK-NEXT: ret void @@ -65,9 +67,10 @@ ; Use constant twice from the same kernel but a different other constant. define amdgpu_kernel void @k3(i64 %x) { ; CHECK-LABEL: @k3( +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k3.lds) ] ; CHECK-NEXT: %1 = getelementptr inbounds [32 x i8], ptr addrspace(3) @llvm.amdgcn.kernel.k3.lds, i32 0, i32 16 ; CHECK-NEXT: %ptr1 = addrspacecast ptr addrspace(3) %1 to ptr -; CHECK-NEXT: store i64 1, ptr %ptr1, align 1 +; CHECK-NEXT: store i64 1, ptr %ptr1, align 16 ; CHECK-NEXT: %2 = getelementptr inbounds [32 x i8], ptr addrspace(3) @llvm.amdgcn.kernel.k3.lds, i32 0, i32 24 ; CHECK-NEXT: %ptr2 = addrspacecast ptr addrspace(3) %2 to ptr ; CHECK-NEXT: store i64 2, ptr %ptr2, align 8 @@ -83,6 +86,7 @@ ; @lds.1 is used from constant expressions in different kernels. define amdgpu_kernel void @k4(i64 %x) { ; CHECK-LABEL: @k4( +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k4.lds) ] ; CHECK-NEXT: %1 = addrspacecast ptr addrspace(3) @llvm.amdgcn.kernel.k4.lds to ptr ; CHECK-NEXT: %ptr = getelementptr inbounds i8, ptr %1, i64 %x ; CHECK-NEXT: store i8 1, ptr %ptr, align 1 @@ -98,9 +102,11 @@ ; Multiple constexpr use in a same instruction. define amdgpu_kernel void @k5() { ; CHECK-LABEL: @k5( -; CHECK-NEXT: %1 = addrspacecast ptr addrspace(3) @llvm.amdgcn.kernel.k5.lds to ptr -; CHECK-NEXT: %2 = addrspacecast ptr addrspace(3) @llvm.amdgcn.kernel.k5.lds to ptr -; CHECK-NEXT: call void undef(ptr %1, ptr %2) +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k5.lds) ] +; CHECK-NEXT: %1 = addrspacecast ptr addrspace(3) @llvm.amdgcn.kernel.k5.lds to ptr +; CHECK-NEXT: %2 = addrspacecast ptr addrspace(3) @llvm.amdgcn.kernel.k5.lds to ptr +; CHECK-NEXT: call void undef(ptr %1, ptr %2) +; CHECK-NEXT: ret void ; call void undef(ptr addrspacecast (ptr addrspace(3) @lds.4 to ptr), ptr addrspacecast (ptr addrspace(3) @lds.4 to ptr)) ret void @@ -113,13 +119,19 @@ ; expression operands of store should be replaced by equivalent instruction sequences. define amdgpu_kernel void @k6() { ; CHECK-LABEL: @k6( - -; CHECK-NEXT: %1 = getelementptr inbounds [4 x i32], ptr addrspace(3) @llvm.amdgcn.kernel.k6.lds, i32 0, i32 2 -; CHECK-NEXT: %2 = ptrtoint ptr addrspace(3) %1 to i32 -; CHECK-NEXT: %3 = getelementptr inbounds [4 x i32], ptr addrspace(3) @llvm.amdgcn.kernel.k6.lds, i32 0, i32 2 -; CHECK-NEXT: store i32 %2, ptr addrspace(3) %3, align 8 -; CHECK-NEXT: ret void +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k6.lds) ] +; CHECK-NEXT: %1 = getelementptr inbounds [4 x i32], ptr addrspace(3) @llvm.amdgcn.kernel.k6.lds, i32 0, i32 2 +; CHECK-NEXT: %2 = ptrtoint ptr addrspace(3) %1 to i32 +; CHECK-NEXT: %3 = getelementptr inbounds [4 x i32], ptr addrspace(3) @llvm.amdgcn.kernel.k6.lds, i32 0, i32 2 +; CHECK-NEXT: store i32 %2, ptr addrspace(3) %3, align 8 +; CHECK-NEXT: ret void ; + store i32 ptrtoint (ptr addrspace(3) getelementptr inbounds ([4 x i32], ptr addrspace(3) @lds.5, i32 0, i32 2) to i32), ptr addrspace(3) getelementptr inbounds ([4 x i32], ptr addrspace(3) @lds.5, i32 0, i32 2) ret void } +;. +; CHECK: attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) } +;. +; CHECK: !0 = !{i64 0, i64 1} +;. diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds.ll --- a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds.ll @@ -16,10 +16,11 @@ ;. define amdgpu_kernel void @k0() { ; CHECK-LABEL: @k0( -; CHECK-NEXT: store i8 1, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, i32 0, i32 3), align 2, !alias.scope !1, !noalias !4 -; CHECK-NEXT: store i8 2, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, i32 0, i32 2), align 4, !alias.scope !8, !noalias !9 -; CHECK-NEXT: store i8 4, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, i32 0, i32 1), align 16, !alias.scope !10, !noalias !11 -; CHECK-NEXT: store i8 16, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, align 16, !alias.scope !12, !noalias !13 +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds) ], !alias.scope !1, !noalias !4 +; CHECK-NEXT: store i8 1, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, i32 0, i32 3), align 2, !alias.scope !8, !noalias !9 +; CHECK-NEXT: store i8 2, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, i32 0, i32 2), align 4, !alias.scope !10, !noalias !11 +; CHECK-NEXT: store i8 4, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, i32 0, i32 1), align 16, !alias.scope !12, !noalias !13 +; CHECK-NEXT: store i8 16, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, align 16, !alias.scope !1, !noalias !4 ; CHECK-NEXT: ret void store i8 1, ptr addrspace(3) @lds.size.1.align.1, align 1 @@ -34,9 +35,10 @@ define amdgpu_kernel void @k1() { ; CHECK-LABEL: @k1( -; CHECK-NEXT: store i8 2, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds, i32 0, i32 2), align 4, !alias.scope !14, !noalias !17 -; CHECK-NEXT: store i8 4, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds, i32 0, i32 1), align 16, !alias.scope !20, !noalias !21 -; CHECK-NEXT: store i8 16, ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds, align 16, !alias.scope !22, !noalias !23 +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds) ], !alias.scope !14, !noalias !17 +; CHECK-NEXT: store i8 2, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds, i32 0, i32 2), align 4, !alias.scope !20, !noalias !21 +; CHECK-NEXT: store i8 4, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds, i32 0, i32 1), align 16, !alias.scope !22, !noalias !23 +; CHECK-NEXT: store i8 16, ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds, align 16, !alias.scope !14, !noalias !17 ; CHECK-NEXT: ret void ; store i8 2, ptr addrspace(3) @lds.size.2.align.2, align 2 diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll --- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll @@ -33,6 +33,7 @@ ; GCN-NEXT: s_endpgm ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.test.lds) ], !alias.scope !1, !noalias !4 ; CHECK-NEXT: store i8 3, ptr addrspace(3) @llvm.amdgcn.kernel.test.lds, align 4, !alias.scope !1, !noalias !4 ; CHECK-NEXT: tail call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) noundef align 1 dereferenceable(3) getelementptr inbounds (%llvm.amdgcn.kernel.test.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.test.lds, i32 0, i32 2), ptr addrspace(3) noundef align 1 dereferenceable(3) @llvm.amdgcn.kernel.test.lds, i64 3, i1 false), !alias.scope !6, !noalias !7 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.test.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.test.lds, i32 0, i32 2), align 4, !alias.scope !4, !noalias !1 diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll --- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll @@ -5,10 +5,11 @@ @b = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4 ; CHECK-LABEL: @no_clobber_ds_load_stores_x2_preexisting_aa -; CHECK: store i32 1, ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x2_preexisting_aa.lds, align 16, !tbaa !1, !noalias !6 -; CHECK: %val.a = load i32, ptr addrspace(3) %gep.a, align 4, !tbaa !1, !noalias !6 -; CHECK: store i32 2, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.no_clobber_ds_load_stores_x2_preexisting_aa.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x2_preexisting_aa.lds, i32 0, i32 1), align 16, !tbaa !1, !noalias !6 -; CHECK: %val.b = load i32, ptr addrspace(3) %gep.b, align 4, !tbaa !1, !noalias !6 +; CHECK: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x2_preexisting_aa.lds) ], !alias.scope !1, !noalias !4 +; CHECK: store i32 1, ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x2_preexisting_aa.lds, align 16, !tbaa !6, !noalias !11 +; CHECK: %val.a = load i32, ptr addrspace(3) %gep.a, align 4, !tbaa !6, !noalias !11 +; CHECK: store i32 2, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.no_clobber_ds_load_stores_x2_preexisting_aa.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x2_preexisting_aa.lds, i32 0, i32 1), align 16, !tbaa !6, !noalias !11 +; CHECK: %val.b = load i32, ptr addrspace(3) %gep.b, align 4, !tbaa !6, !noalias !11 define amdgpu_kernel void @no_clobber_ds_load_stores_x2_preexisting_aa(ptr addrspace(1) %arg, i32 %i) { bb: @@ -34,10 +35,15 @@ !8 = !{!"omnipotent char", !9, i64 0} !9 = !{!"Simple C++ TBAA"} -; CHECK:!0 = !{i64 0, i64 1} -; CHECK:!1 = !{!2, !3, i64 0} -; CHECK:!2 = !{!"no_clobber_ds_load_stores_x2_preexisting_aa", !3, i64 0} -; CHECK:!3 = !{!"int", !4, i64 0} -; CHECK:!4 = !{!"omnipotent char", !5, i64 0} -; CHECK:!5 = !{!"Simple C++ TBAA"} -; CHECK:!6 = !{} +; CHECK: !0 = !{i64 0, i64 1} +; CHECK: !1 = !{!2} +; CHECK: !2 = distinct !{!2, !3} +; CHECK: !3 = distinct !{!3} +; CHECK: !4 = !{!5} +; CHECK: !5 = distinct !{!5, !3} +; CHECK: !6 = !{!7, !8, i64 0} +; CHECK: !7 = !{!"no_clobber_ds_load_stores_x2_preexisting_aa", !8, i64 0} +; CHECK: !8 = !{!"int", !9, i64 0} +; CHECK: !9 = !{!"omnipotent char", !10, i64 0} +; CHECK: !10 = !{!"Simple C++ TBAA"} +; CHECK: !11 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr-phi.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr-phi.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr-phi.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr-phi.ll @@ -8,6 +8,7 @@ define amdgpu_kernel void @func(i32 %c) { ; CHECK-LABEL: @func( ; CHECK-NEXT: entry: +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.func.lds) ] ; CHECK-NEXT: switch i32 [[C:%.*]], label [[RETURN:%.*]] [ ; CHECK-NEXT: i32 0, label [[BB0:%.*]] ; CHECK-NEXT: i32 1, label [[BB1:%.*]] diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll @@ -49,8 +49,7 @@ } ; CHECK-LABEL: @timestwo() #0 -; CHECK-NOT: call void @llvm.donothing() - +; CHECK: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.timestwo.lds) ] ; CHECK: %1 = addrspacecast ptr addrspace(3) @llvm.amdgcn.kernel.timestwo.lds to ptr ; CHECK: %2 = ptrtoint ptr %1 to i64 ; CHECK: %3 = addrspacecast ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.timestwo.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.timestwo.lds, i32 0, i32 1) to ptr diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll @@ -17,6 +17,7 @@ ; GCN: ds_write_b8 [[NULL]], [[TWO]] offset:16 define amdgpu_kernel void @k0() { ; OPT-LABEL: @k0( +; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds) ] ; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] ; OPT-NEXT: store i8 1, ptr addrspace(3) @llvm.amdgcn.module.lds, align 1 ; OPT-NEXT: store i8 2, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, align 16 diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-ambiguous.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-ambiguous.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-ambiguous.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-ambiguous.ll @@ -9,6 +9,7 @@ @kernel.lds = addrspace(3) global i8 undef define amdgpu_kernel void @k0() { ; CHECK-LABEL: @k0( +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds) ] ; CHECK-NEXT: [[LD:%.*]] = load i8, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, align 1 ; CHECK-NEXT: [[MUL:%.*]] = mul i8 [[LD]], 2 ; CHECK-NEXT: store i8 [[MUL]], ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, align 1 @@ -22,6 +23,7 @@ define amdgpu_kernel void @k1() { ; CHECK-LABEL: @k1( +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds) ] ; CHECK-NEXT: [[LD:%.*]] = load i8, ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds, align 1 ; CHECK-NEXT: [[MUL:%.*]] = mul i8 [[LD]], 3 ; CHECK-NEXT: store i8 [[MUL]], ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds, align 1 diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-unambiguous.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-unambiguous.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-unambiguous.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-unambiguous.ll @@ -12,6 +12,7 @@ @k0.lds = addrspace(3) global i8 undef define amdgpu_kernel void @k0() { ; CHECK-LABEL: @k0( +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds) ] ; CHECK-NEXT: [[LD:%.*]] = load i8, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, align 1 ; CHECK-NEXT: [[MUL:%.*]] = mul i8 [[LD]], 2 ; CHECK-NEXT: store i8 [[MUL]], ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, align 1 @@ -28,9 +29,9 @@ @f0.lds = addrspace(3) global i16 undef define void @f0() { ; MODULE-LABEL: @f0( -; MODULE-NEXT: [[LD:%.*]] = load i16, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 4, !alias.scope !1, !noalias !4 +; MODULE-NEXT: [[LD:%.*]] = load i16, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 4, !alias.scope [[META1:![0-9]+]], !noalias [[META4:![0-9]+]] ; MODULE-NEXT: [[MUL:%.*]] = mul i16 [[LD]], 3 -; MODULE-NEXT: store i16 [[MUL]], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 4, !alias.scope !1, !noalias !4 +; MODULE-NEXT: store i16 [[MUL]], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 4, !alias.scope [[META1]], !noalias [[META4]] ; MODULE-NEXT: ret void ; ; TABLE-LABEL: @f0( @@ -60,7 +61,7 @@ define amdgpu_kernel void @k_f0() { ; MODULE-LABEL: @k_f0( -; MODULE-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ], !alias.scope !5, !noalias !1 +; MODULE-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ], !alias.scope [[META5:![0-9]+]], !noalias [[META1]] ; MODULE-NEXT: call void @f0() ; MODULE-NEXT: ret void ; @@ -70,6 +71,7 @@ ; TABLE-NEXT: ret void ; ; K_OR_HY-LABEL: @k_f0( +; K_OR_HY-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k_f0.lds) ] ; K_OR_HY-NEXT: call void @f0() ; K_OR_HY-NEXT: ret void ; @@ -82,9 +84,9 @@ @both.lds = addrspace(3) global i32 undef define void @f_both() { ; MODULE-LABEL: @f_both( -; MODULE-NEXT: [[LD:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope !5, !noalias !4 +; MODULE-NEXT: [[LD:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope [[META5]], !noalias [[META4]] ; MODULE-NEXT: [[MUL:%.*]] = mul i32 [[LD]], 4 -; MODULE-NEXT: store i32 [[MUL]], ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope !5, !noalias !4 +; MODULE-NEXT: store i32 [[MUL]], ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope [[META5]], !noalias [[META4]] ; MODULE-NEXT: ret void ; ; TABLE-LABEL: @f_both( @@ -115,9 +117,9 @@ define amdgpu_kernel void @k0_both() { ; MODULE-LABEL: @k0_both( ; MODULE-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] -; MODULE-NEXT: [[LD:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope !5, !noalias !1 +; MODULE-NEXT: [[LD:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope [[META5]], !noalias [[META1]] ; MODULE-NEXT: [[MUL:%.*]] = mul i32 [[LD]], 5 -; MODULE-NEXT: store i32 [[MUL]], ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope !5, !noalias !1 +; MODULE-NEXT: store i32 [[MUL]], ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope [[META5]], !noalias [[META1]] ; MODULE-NEXT: call void @f_both() ; MODULE-NEXT: ret void ; @@ -130,6 +132,7 @@ ; TABLE-NEXT: ret void ; ; K_OR_HY-LABEL: @k0_both( +; K_OR_HY-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k0_both.lds) ] ; K_OR_HY-NEXT: [[LD:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.kernel.k0_both.lds, align 4 ; K_OR_HY-NEXT: [[MUL:%.*]] = mul i32 [[LD]], 5 ; K_OR_HY-NEXT: store i32 [[MUL]], ptr addrspace(3) @llvm.amdgcn.kernel.k0_both.lds, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll @@ -19,12 +19,11 @@ ; OPT: @llvm.amdgcn.kernel.k123.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k123.lds.t undef, align 8, !absolute_symbol !2 ; OPT{LITERAL}: @llvm.amdgcn.lds.offset.table = internal addrspace(4) constant [2 x [1 x i32]] [[1 x i32] [i32 ptrtoint (ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds to i32)], [1 x i32] [i32 ptrtoint (ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds to i32)]] -;. define void @f0() { ; OPT-LABEL: @f0( -; OPT-NEXT: %ld = load float, ptr addrspace(3) @llvm.amdgcn.kernel.k01.lds, align 4 -; OPT-NEXT: %mul = fmul float %ld, 2.000000e+00 -; OPT-NEXT: store float %mul, ptr addrspace(3) @llvm.amdgcn.kernel.k01.lds, align 4 +; OPT-NEXT: [[LD:%.*]] = load float, ptr addrspace(3) @llvm.amdgcn.kernel.k01.lds, align 4 +; OPT-NEXT: [[MUL:%.*]] = fmul float [[LD]], 2.000000e+00 +; OPT-NEXT: store float [[MUL]], ptr addrspace(3) @llvm.amdgcn.kernel.k01.lds, align 4 ; OPT-NEXT: ret void ; ; GCN-LABEL: f0: @@ -46,9 +45,9 @@ define void @f1() { ; OPT-LABEL: @f1( -; OPT-NEXT: %ld = load i16, ptr addrspace(3) @llvm.amdgcn.module.lds, align 16 -; OPT-NEXT: %mul = mul i16 %ld, 3 -; OPT-NEXT: store i16 %mul, ptr addrspace(3) @llvm.amdgcn.module.lds, align 16 +; OPT-NEXT: [[LD:%.*]] = load i16, ptr addrspace(3) @llvm.amdgcn.module.lds, align 16 +; OPT-NEXT: [[MUL:%.*]] = mul i16 [[LD]], 3 +; OPT-NEXT: store i16 [[MUL]], ptr addrspace(3) @llvm.amdgcn.module.lds, align 16 ; OPT-NEXT: ret void ; ; GCN-LABEL: f1: @@ -70,16 +69,16 @@ define void @f2() { ; OPT-LABEL: @f2( -; OPT-NEXT: %1 = call i32 @llvm.amdgcn.lds.kernel.id() -; OPT-NEXT: %v22 = getelementptr inbounds [2 x [1 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 %1, i32 0 -; OPT-NEXT: %2 = load i32, ptr addrspace(4) %v22, align 4 -; OPT-NEXT: %v23 = inttoptr i32 %2 to ptr addrspace(3) -; OPT-NEXT: %ld = load i64, ptr addrspace(3) %v23, align 4 -; OPT-NEXT: %mul = mul i64 %ld, 4 -; OPT-NEXT: %v2 = getelementptr inbounds [2 x [1 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 %1, i32 0 -; OPT-NEXT: %3 = load i32, ptr addrspace(4) %v2, align 4 -; OPT-NEXT: %v21 = inttoptr i32 %3 to ptr addrspace(3) -; OPT-NEXT: store i64 %mul, ptr addrspace(3) %v21, align 4 +; OPT-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id() +; OPT-NEXT: [[V22:%.*]] = getelementptr inbounds [2 x [1 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0 +; OPT-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[V22]], align 4 +; OPT-NEXT: [[V23:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3) +; OPT-NEXT: [[LD:%.*]] = load i64, ptr addrspace(3) [[V23]], align 4 +; OPT-NEXT: [[MUL:%.*]] = mul i64 [[LD]], 4 +; OPT-NEXT: [[V2:%.*]] = getelementptr inbounds [2 x [1 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0 +; OPT-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[V2]], align 4 +; OPT-NEXT: [[V21:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(3) +; OPT-NEXT: store i64 [[MUL]], ptr addrspace(3) [[V21]], align 4 ; OPT-NEXT: ret void ; ; GCN-LABEL: f2: @@ -111,9 +110,9 @@ define void @f3() { ; OPT-LABEL: @f3( -; OPT-NEXT: %ld = load i8, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k23.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds, i32 0, i32 1), align 8 -; OPT-NEXT: %mul = mul i8 %ld, 5 -; OPT-NEXT: store i8 %mul, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k23.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds, i32 0, i32 1), align 8 +; OPT-NEXT: [[LD:%.*]] = load i8, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K23_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds, i32 0, i32 1), align 8 +; OPT-NEXT: [[MUL:%.*]] = mul i8 [[LD]], 5 +; OPT-NEXT: store i8 [[MUL]], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K23_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds, i32 0, i32 1), align 8 ; OPT-NEXT: ret void ; ; GCN-LABEL: f3: @@ -136,9 +135,10 @@ ; Doesn't access any via a function, won't be in the lookup table define amdgpu_kernel void @kernel_no_table() { ; OPT-LABEL: @kernel_no_table( -; OPT-NEXT: %ld = load i64, ptr addrspace(3) @llvm.amdgcn.kernel.kernel_no_table.lds, align 8 -; OPT-NEXT: %mul = mul i64 %ld, 8 -; OPT-NEXT: store i64 %mul, ptr addrspace(3) @llvm.amdgcn.kernel.kernel_no_table.lds, align 8 +; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.kernel_no_table.lds) ] +; OPT-NEXT: [[LD:%.*]] = load i64, ptr addrspace(3) @llvm.amdgcn.kernel.kernel_no_table.lds, align 8 +; OPT-NEXT: [[MUL:%.*]] = mul i64 [[LD]], 8 +; OPT-NEXT: store i64 [[MUL]], ptr addrspace(3) @llvm.amdgcn.kernel.kernel_no_table.lds, align 8 ; OPT-NEXT: ret void ; ; GCN-LABEL: kernel_no_table: @@ -159,6 +159,7 @@ ; Access two variables, will allocate those two define amdgpu_kernel void @k01() { ; OPT-LABEL: @k01( +; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k01.lds) ] ; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] ; OPT-NEXT: call void @f0() ; OPT-NEXT: call void @f1() @@ -193,7 +194,7 @@ define amdgpu_kernel void @k23() { ; OPT-LABEL: @k23( -; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds) ] +; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds) ], !alias.scope [[META4:![0-9]+]], !noalias [[META7:![0-9]+]] ; OPT-NEXT: call void @f2() ; OPT-NEXT: call void @f3() ; OPT-NEXT: ret void @@ -231,12 +232,12 @@ ; Access and allocate three variables define amdgpu_kernel void @k123() { ; OPT-LABEL: @k123( -; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds) ] +; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds) ], !alias.scope [[META10:![0-9]+]], !noalias [[META13:![0-9]+]] ; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] ; OPT-NEXT: call void @f1() -; OPT-NEXT: %ld = load i8, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k123.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 8, !alias.scope !5, !noalias !8 -; OPT-NEXT: %mul = mul i8 %ld, 8 -; OPT-NEXT: store i8 %mul, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k123.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 8, !alias.scope !5, !noalias !8 +; OPT-NEXT: [[LD:%.*]] = load i8, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 8, !alias.scope [[META13]], !noalias [[META10]] +; OPT-NEXT: [[MUL:%.*]] = mul i8 [[LD]], 8 +; OPT-NEXT: store i8 [[MUL]], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 8, !alias.scope [[META13]], !noalias [[META10]] ; OPT-NEXT: call void @f2() ; OPT-NEXT: ret void ; @@ -284,22 +285,25 @@ !2 = !{i32 1} -;. ; OPT: attributes #0 = { "amdgpu-elide-module-lds" } ; OPT: attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) } ; OPT: attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -;. + ; OPT: !0 = !{i64 0, i64 1} ; OPT: !1 = !{i64 4, i64 5} ; OPT: !2 = !{i64 8, i64 9} ; OPT: !3 = !{i32 1} -; OPT: !4 = !{i32 0} -; OPT: !5 = !{!6} -; OPT: !6 = distinct !{!6, !7} -; OPT: !7 = distinct !{!7} -; OPT: !8 = !{!9} -; OPT: !9 = distinct !{!9, !7} -;. +; OPT: !4 = !{!5} +; OPT: !5 = distinct !{!5, !6} +; OPT: !6 = distinct !{!6} +; OPT: !7 = !{!8} +; OPT: !8 = distinct !{!8, !6} +; OPT: !9 = !{i32 0} +; OPT: !10 = !{!11} +; OPT: !11 = distinct !{!11, !12} +; OPT: !12 = distinct !{!12} +; OPT: !13 = !{!14} +; OPT: !14 = distinct !{!14, !12} ; Table size length number-kernels * number-variables * sizeof(uint16_t) ; GCN: .type llvm.amdgcn.lds.offset.table,@object diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll @@ -196,6 +196,7 @@ ; Doesn't access any via a function, won't be in the lookup table define amdgpu_kernel void @kernel_no_table() { ; OPT-LABEL: @kernel_no_table() { +; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.kernel_no_table.lds) ] ; OPT-NEXT: [[LD:%.*]] = load i64, ptr addrspace(3) @llvm.amdgcn.kernel.kernel_no_table.lds, align 8 ; OPT-NEXT: [[MUL:%.*]] = mul i64 [[LD]], 8 ; OPT-NEXT: store i64 [[MUL]], ptr addrspace(3) @llvm.amdgcn.kernel.kernel_no_table.lds, align 8 @@ -256,7 +257,7 @@ } define amdgpu_kernel void @k23() { -; OPT-LABEL: @k23() !llvm.amdgcn.lds.kernel.id !2 { +; OPT-LABEL: @k23() !llvm.amdgcn.lds.kernel.id !7 { ; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds) ] ; OPT-NEXT: call void @f2() ; OPT-NEXT: call void @f3() @@ -295,12 +296,12 @@ ; Access and allocate three variables define amdgpu_kernel void @k123() { -; OPT-LABEL: @k123() !llvm.amdgcn.lds.kernel.id !3 { +; OPT-LABEL: @k123() !llvm.amdgcn.lds.kernel.id !13 { ; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds) ] ; OPT-NEXT: call void @f1() -; OPT-NEXT: [[LD:%.*]] = load i8, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 2, !alias.scope !4, !noalias !7 +; OPT-NEXT: [[LD:%.*]] = load i8, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 2, !alias.scope !20, !noalias !21 ; OPT-NEXT: [[MUL:%.*]] = mul i8 [[LD]], 8 -; OPT-NEXT: store i8 [[MUL]], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 2, !alias.scope !4, !noalias !7 +; OPT-NEXT: store i8 [[MUL]], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 2, !alias.scope !20, !noalias !21 ; OPT-NEXT: call void @f2() ; OPT-NEXT: ret void ; diff --git a/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll b/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll --- a/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll @@ -20,13 +20,13 @@ define amdgpu_kernel void @simple_barrier(ptr addrspace(1) %arg) { ; CHECK-LABEL: @simple_barrier( ; CHECK-NEXT: bb: -; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0 +; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber [[META0:![0-9]+]] ; CHECK-NEXT: fence syncscope("workgroup") release ; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier() ; CHECK-NEXT: fence syncscope("workgroup") acquire ; CHECK-NEXT: tail call void @llvm.amdgcn.wave.barrier() -; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform !0 -; CHECK-NEXT: [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4, !amdgpu.noclobber !0 +; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform [[META0]] +; CHECK-NEXT: [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4, !amdgpu.noclobber [[META0]] ; CHECK-NEXT: [[I3:%.*]] = add i32 [[I2]], [[I]] ; CHECK-NEXT: [[I4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 2 ; CHECK-NEXT: store i32 [[I3]], ptr addrspace(1) [[I4]], align 4 @@ -58,17 +58,17 @@ define amdgpu_kernel void @memory_phi_no_clobber(ptr addrspace(1) %arg) { ; CHECK-LABEL: @memory_phi_no_clobber( ; CHECK-NEXT: bb: -; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0 -; CHECK-NEXT: br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]], !amdgpu.uniform !0 +; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber [[META0]] +; CHECK-NEXT: br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]], !amdgpu.uniform [[META0]] ; CHECK: if.then: ; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: br label [[IF_END:%.*]], !amdgpu.uniform !0 +; CHECK-NEXT: br label [[IF_END:%.*]], !amdgpu.uniform [[META0]] ; CHECK: if.else: ; CHECK-NEXT: fence syncscope("workgroup") release -; CHECK-NEXT: br label [[IF_END]], !amdgpu.uniform !0 +; CHECK-NEXT: br label [[IF_END]], !amdgpu.uniform [[META0]] ; CHECK: if.end: -; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform !0 -; CHECK-NEXT: [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4, !amdgpu.noclobber !0 +; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform [[META0]] +; CHECK-NEXT: [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4, !amdgpu.noclobber [[META0]] ; CHECK-NEXT: [[I3:%.*]] = add i32 [[I2]], [[I]] ; CHECK-NEXT: [[I4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 2 ; CHECK-NEXT: store i32 [[I3]], ptr addrspace(1) [[I4]], align 4 @@ -104,17 +104,17 @@ define amdgpu_kernel void @memory_phi_clobber1(ptr addrspace(1) %arg) { ; CHECK-LABEL: @memory_phi_clobber1( ; CHECK-NEXT: bb: -; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0 -; CHECK-NEXT: br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]], !amdgpu.uniform !0 +; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber [[META0]] +; CHECK-NEXT: br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]], !amdgpu.uniform [[META0]] ; CHECK: if.then: ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 3 ; CHECK-NEXT: store i32 1, ptr addrspace(1) [[GEP]], align 4 -; CHECK-NEXT: br label [[IF_END:%.*]], !amdgpu.uniform !0 +; CHECK-NEXT: br label [[IF_END:%.*]], !amdgpu.uniform [[META0]] ; CHECK: if.else: ; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: br label [[IF_END]], !amdgpu.uniform !0 +; CHECK-NEXT: br label [[IF_END]], !amdgpu.uniform [[META0]] ; CHECK: if.end: -; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform !0 +; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform [[META0]] ; CHECK-NEXT: [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4 ; CHECK-NEXT: [[I3:%.*]] = add i32 [[I2]], [[I]] ; CHECK-NEXT: [[I4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 2 @@ -152,17 +152,17 @@ define amdgpu_kernel void @memory_phi_clobber2(ptr addrspace(1) %arg) { ; CHECK-LABEL: @memory_phi_clobber2( ; CHECK-NEXT: bb: -; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0 -; CHECK-NEXT: br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]], !amdgpu.uniform !0 +; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber [[META0]] +; CHECK-NEXT: br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]], !amdgpu.uniform [[META0]] ; CHECK: if.then: ; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: br label [[IF_END:%.*]], !amdgpu.uniform !0 +; CHECK-NEXT: br label [[IF_END:%.*]], !amdgpu.uniform [[META0]] ; CHECK: if.else: ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 3 ; CHECK-NEXT: store i32 1, ptr addrspace(1) [[GEP]], align 4 -; CHECK-NEXT: br label [[IF_END]], !amdgpu.uniform !0 +; CHECK-NEXT: br label [[IF_END]], !amdgpu.uniform [[META0]] ; CHECK: if.end: -; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform !0 +; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform [[META0]] ; CHECK-NEXT: [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4 ; CHECK-NEXT: [[I3:%.*]] = add i32 [[I2]], [[I]] ; CHECK-NEXT: [[I4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 2 @@ -199,16 +199,16 @@ define amdgpu_kernel void @no_clobbering_loop1(ptr addrspace(1) %arg, i1 %cc) { ; CHECK-LABEL: @no_clobbering_loop1( ; CHECK-NEXT: bb: -; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0 -; CHECK-NEXT: br label [[WHILE_COND:%.*]], !amdgpu.uniform !0 +; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber [[META0]] +; CHECK-NEXT: br label [[WHILE_COND:%.*]], !amdgpu.uniform [[META0]] ; CHECK: while.cond: -; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform !0 -; CHECK-NEXT: [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4, !amdgpu.noclobber !0 +; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform [[META0]] +; CHECK-NEXT: [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4, !amdgpu.noclobber [[META0]] ; CHECK-NEXT: [[I3:%.*]] = add i32 [[I2]], [[I]] ; CHECK-NEXT: [[I4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 2 ; CHECK-NEXT: store i32 [[I3]], ptr addrspace(1) [[I4]], align 4 ; CHECK-NEXT: tail call void @llvm.amdgcn.wave.barrier() -; CHECK-NEXT: br i1 [[CC:%.*]], label [[WHILE_COND]], label [[END:%.*]], !amdgpu.uniform !0 +; CHECK-NEXT: br i1 [[CC:%.*]], label [[WHILE_COND]], label [[END:%.*]], !amdgpu.uniform [[META0]] ; CHECK: end: ; CHECK-NEXT: ret void ; @@ -237,18 +237,18 @@ define amdgpu_kernel void @no_clobbering_loop2(ptr addrspace(1) noalias %arg, ptr addrspace(1) noalias %out, i32 %n) { ; CHECK-LABEL: @no_clobbering_loop2( ; CHECK-NEXT: bb: -; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0 -; CHECK-NEXT: br label [[WHILE_COND:%.*]], !amdgpu.uniform !0 +; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber [[META0]] +; CHECK-NEXT: br label [[WHILE_COND:%.*]], !amdgpu.uniform [[META0]] ; CHECK: while.cond: ; CHECK-NEXT: [[C:%.*]] = phi i32 [ 0, [[BB:%.*]] ], [ [[INC:%.*]], [[WHILE_COND]] ] ; CHECK-NEXT: [[ACC:%.*]] = phi i32 [ [[I]], [[BB]] ], [ [[I3:%.*]], [[WHILE_COND]] ] -; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i32 [[C]], !amdgpu.uniform !0 -; CHECK-NEXT: [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4, !amdgpu.noclobber !0 +; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i32 [[C]], !amdgpu.uniform [[META0]] +; CHECK-NEXT: [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4, !amdgpu.noclobber [[META0]] ; CHECK-NEXT: [[I3]] = add i32 [[I2]], [[ACC]] ; CHECK-NEXT: tail call void @llvm.amdgcn.wave.barrier() ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[C]], 1 ; CHECK-NEXT: [[CC:%.*]] = icmp eq i32 [[INC]], [[N:%.*]] -; CHECK-NEXT: br i1 [[CC]], label [[WHILE_COND]], label [[END:%.*]], !amdgpu.uniform !0 +; CHECK-NEXT: br i1 [[CC]], label [[WHILE_COND]], label [[END:%.*]], !amdgpu.uniform [[META0]] ; CHECK: end: ; CHECK-NEXT: store i32 [[I3]], ptr addrspace(1) [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void @@ -280,16 +280,16 @@ define amdgpu_kernel void @clobbering_loop(ptr addrspace(1) %arg, ptr addrspace(1) %out, i1 %cc) { ; CHECK-LABEL: @clobbering_loop( ; CHECK-NEXT: bb: -; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0 -; CHECK-NEXT: br label [[WHILE_COND:%.*]], !amdgpu.uniform !0 +; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber [[META0]] +; CHECK-NEXT: br label [[WHILE_COND:%.*]], !amdgpu.uniform [[META0]] ; CHECK: while.cond: -; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform !0 +; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform [[META0]] ; CHECK-NEXT: [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4 ; CHECK-NEXT: [[I3:%.*]] = add i32 [[I2]], [[I]] ; CHECK-NEXT: [[I4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[OUT:%.*]], i64 1 ; CHECK-NEXT: store i32 [[I3]], ptr addrspace(1) [[I4]], align 4 ; CHECK-NEXT: tail call void @llvm.amdgcn.wave.barrier() -; CHECK-NEXT: br i1 [[CC:%.*]], label [[WHILE_COND]], label [[END:%.*]], !amdgpu.uniform !0 +; CHECK-NEXT: br i1 [[CC:%.*]], label [[WHILE_COND]], label [[END:%.*]], !amdgpu.uniform [[META0]] ; CHECK: end: ; CHECK-NEXT: ret void ; @@ -318,10 +318,10 @@ define amdgpu_kernel void @clobber_by_atomic_load(ptr addrspace(1) %arg) { ; CHECK-LABEL: @clobber_by_atomic_load( ; CHECK-NEXT: bb: -; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0 -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 2, !amdgpu.uniform !0 -; CHECK-NEXT: [[VAL:%.*]] = load atomic i32, ptr addrspace(1) [[GEP]] seq_cst, align 4, !amdgpu.noclobber !0 -; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 3, !amdgpu.uniform !0 +; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber [[META0]] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 2, !amdgpu.uniform [[META0]] +; CHECK-NEXT: [[VAL:%.*]] = load atomic i32, ptr addrspace(1) [[GEP]] seq_cst, align 4, !amdgpu.noclobber [[META0]] +; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 3, !amdgpu.uniform [[META0]] ; CHECK-NEXT: [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4 ; CHECK-NEXT: [[I3:%.*]] = add i32 [[I2]], [[I]] ; CHECK-NEXT: [[I4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 4 @@ -353,7 +353,7 @@ ; CHECK-NEXT: fence syncscope("workgroup") release ; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier() ; CHECK-NEXT: fence syncscope("workgroup") acquire -; CHECK-NEXT: [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber !0 +; CHECK-NEXT: [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber [[META0]] ; CHECK-NEXT: store i32 [[LD]], ptr addrspace(1) [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; @@ -406,7 +406,7 @@ ; CHECK-NEXT: fence syncscope("workgroup") release ; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier() ; CHECK-NEXT: fence syncscope("workgroup") acquire -; CHECK-NEXT: [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber !0 +; CHECK-NEXT: [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber [[META0]] ; CHECK-NEXT: store i32 [[LD]], ptr addrspace(1) [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; @@ -429,7 +429,7 @@ ; CHECK-LABEL: @no_alias_atomic_rmw_relaxed( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[UNUSED:%.*]] = atomicrmw add ptr addrspace(3) @LDS, i32 5 monotonic, align 4 -; CHECK-NEXT: [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber !0 +; CHECK-NEXT: [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber [[META0]] ; CHECK-NEXT: store i32 [[LD]], ptr addrspace(1) [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; @@ -452,7 +452,7 @@ ; CHECK-NEXT: fence syncscope("workgroup") release ; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier() ; CHECK-NEXT: fence syncscope("workgroup") acquire -; CHECK-NEXT: [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber !0 +; CHECK-NEXT: [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber [[META0]] ; CHECK-NEXT: store i32 [[LD]], ptr addrspace(1) [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; @@ -478,7 +478,7 @@ ; CHECK-NEXT: fence syncscope("workgroup") release ; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier() ; CHECK-NEXT: fence syncscope("workgroup") acquire -; CHECK-NEXT: [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber !0 +; CHECK-NEXT: [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber [[META0]] ; CHECK-NEXT: store i32 [[LD]], ptr addrspace(1) [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; @@ -587,7 +587,7 @@ ; CHECK-NEXT: fence syncscope("workgroup") release ; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier() ; CHECK-NEXT: fence syncscope("workgroup") acquire -; CHECK-NEXT: [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber !0 +; CHECK-NEXT: [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber [[META0]] ; CHECK-NEXT: store i32 [[LD]], ptr addrspace(1) [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ;