diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -216,16 +216,6 @@
     std::vector<GlobalVariable *> KernelUsedVariables =
         AMDGPU::findLDSVariablesToLower(M, &F);

-    // Replace all constant uses with instructions if they belong to the
-    // current kernel. Unnecessary, removing will cause test churn.
-    for (GlobalVariable *GV : KernelUsedVariables) {
-      for (User *U : make_early_inc_range(GV->users())) {
-        if (ConstantExpr *C = dyn_cast<ConstantExpr>(U))
-          AMDGPU::replaceConstantUsesInFunction(C, &F);
-      }
-      GV->removeDeadConstantUsers();
-    }
-
     if (!KernelUsedVariables.empty()) {
       std::string VarName =
           (Twine("llvm.amdgcn.kernel.") + F.getName() + ".lds").str();
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
--- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -1002,7 +1002,7 @@
 ; CI: ; %bb.0:
 ; CI-NEXT: v_mov_b32_e32 v0, 0
 ; CI-NEXT: s_mov_b32 m0, -1
-; CI-NEXT: ds_read_b128 v[0:3], v0
+; CI-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; CI-NEXT: s_mov_b32 s3, 0xf000
 ; CI-NEXT: s_mov_b32 s2, -1
@@ -1012,16 +1012,27 @@
 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; CI-NEXT: s_endpgm
 ;
-; GFX9-LABEL: load_misaligned64_constant_offsets:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: ds_read_b128 v[0:3], v4
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
-; GFX9-NEXT: s_endpgm
+; GFX9-ALIGNED-LABEL: load_misaligned64_constant_offsets:
+; GFX9-ALIGNED: ; %bb.0:
+; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-ALIGNED-NEXT: ds_read2_b64 v[0:3], v4 offset1:1
+; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-ALIGNED-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-ALIGNED-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-ALIGNED-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX9-ALIGNED-NEXT: s_endpgm
+;
+; GFX9-UNALIGNED-LABEL: load_misaligned64_constant_offsets:
+; GFX9-UNALIGNED: ; %bb.0:
+; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-UNALIGNED-NEXT: ds_read_b128 v[0:3], v4
+; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-UNALIGNED-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-UNALIGNED-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-UNALIGNED-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX9-UNALIGNED-NEXT: s_endpgm
   %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
   %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
   %sum = add i64 %val0, %val1
diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
--- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
@@ -816,22 +816,31 @@
 define amdgpu_kernel void @store_misaligned64_constant_offsets() {
 ; CI-LABEL: store_misaligned64_constant_offsets:
 ; CI: ; %bb.0:
-; CI-NEXT: v_mov_b32_e32 v0, 0x7b
-; CI-NEXT: v_mov_b32_e32 v1, 0
-; CI-NEXT: v_mov_b32_e32 v2, v0
-; CI-NEXT: v_mov_b32_e32 v3, v1
+; CI-NEXT: s_mov_b64 s[0:1], 0x7b
+; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: v_mov_b32_e32 v2, 0
+; CI-NEXT: v_mov_b32_e32 v1, s1
 ; CI-NEXT: s_mov_b32 m0, -1
-; CI-NEXT: ds_write_b128 v1, v[0:3]
+; CI-NEXT: ds_write2_b64 v2, v[0:1], v[0:1] offset1:1
 ; CI-NEXT: s_endpgm
 ;
-; GFX9-LABEL: store_misaligned64_constant_offsets:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: ds_write_b128 v1, v[0:3]
-; GFX9-NEXT: s_endpgm
+; GFX9-ALIGNED-LABEL: store_misaligned64_constant_offsets:
+; GFX9-ALIGNED: ; %bb.0:
+; GFX9-ALIGNED-NEXT: s_mov_b64 s[0:1], 0x7b
+; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-ALIGNED-NEXT: ds_write2_b64 v2, v[0:1], v[0:1] offset1:1
+; GFX9-ALIGNED-NEXT: s_endpgm
+;
+; GFX9-UNALIGNED-LABEL: store_misaligned64_constant_offsets:
+; GFX9-UNALIGNED: ; %bb.0:
+; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-UNALIGNED-NEXT: ds_write_b128 v1, v[0:3]
+; GFX9-UNALIGNED-NEXT: s_endpgm
   store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
   store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/loop_break.ll b/llvm/test/CodeGen/AMDGPU/loop_break.ll
--- a/llvm/test/CodeGen/AMDGPU/loop_break.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop_break.ll
@@ -209,7 +209,11 @@
 ; GCN-NEXT: ; implicit-def: $sgpr6
 ; GCN-NEXT: .LBB2_1: ; %bb1
 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT: s_or_b64 s[4:5], s[4:5], exec
+; GCN-NEXT: s_cmp_lg_u32 lds@abs32@lo, 4
+; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GCN-NEXT: s_andn2_b64 s[4:5], s[4:5], exec
+; GCN-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GCN-NEXT: s_cmp_gt_i32 s6, -1
 ; GCN-NEXT: s_cbranch_scc1 .LBB2_3
 ; GCN-NEXT: ; %bb.2: ; %bb4
diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll
@@ -24,9 +24,7 @@
 ;.
 define amdgpu_kernel void @k0(i64 %x) {
 ; CHECK-LABEL: @k0(
-; CHECK-NEXT: %1 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0), i32 0, i32 0
-; CHECK-NEXT: %2 = addrspacecast i8 addrspace(3)* %1 to i8*
-; CHECK-NEXT: %ptr = getelementptr inbounds i8, i8* %2, i64 %x
+; CHECK-NEXT: %ptr = getelementptr inbounds i8, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(3)* @lds.1, i32 0, i32 0) to i8*), i64 %x
 ; CHECK-NEXT: store i8 1, i8* %ptr, align 1
 ; CHECK-NEXT: ret void
 ;
@@ -37,9 +35,7 @@

 define amdgpu_kernel void @k1(i64 %x) {
 ; CHECK-LABEL: @k1(
-; CHECK-NEXT: %1 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 0), i32 0, i32 0
-; CHECK-NEXT: %2 = addrspacecast i8 addrspace(3)* %1 to i8*
-; CHECK-NEXT: %ptr = getelementptr inbounds i8, i8* %2, i64 %x
+; CHECK-NEXT: %ptr = getelementptr inbounds i8, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(3)* @lds.1, i32 0, i32 0) to i8*), i64 %x
 ; CHECK-NEXT: store i8 1, i8* %ptr, align 1
 ; CHECK-NEXT: ret void
 ;
@@ -71,14 +67,10 @@
 ; Use constant twice from the same kernel but a different other constant.
 define amdgpu_kernel void @k3(i64 %x) {
 ; CHECK-LABEL: @k3(
-; CHECK-NEXT: %1 = getelementptr inbounds [32 x i8], [32 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k3.lds.t, %llvm.amdgcn.kernel.k3.lds.t addrspace(3)* @llvm.amdgcn.kernel.k3.lds, i32 0, i32 0), i32 0, i32 16
-; CHECK-NEXT: %2 = bitcast i8 addrspace(3)* %1 to i64 addrspace(3)*
-; CHECK-NEXT: %ptr1 = addrspacecast i64 addrspace(3)* %2 to i64*
+; CHECK-NEXT: %ptr1 = addrspacecast i64 addrspace(3)* bitcast (i8 addrspace(3)* getelementptr inbounds ([32 x i8], [32 x i8] addrspace(3)* @lds.3, i32 0, i32 16) to i64 addrspace(3)*) to i64*
 ; CHECK-NEXT: store i64 1, i64* %ptr1, align 1
-; CHECK-NEXT: %3 = getelementptr inbounds [32 x i8], [32 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k3.lds.t, %llvm.amdgcn.kernel.k3.lds.t addrspace(3)* @llvm.amdgcn.kernel.k3.lds, i32 0, i32 0), i32 0, i32 24
-; CHECK-NEXT: %4 = bitcast i8 addrspace(3)* %3 to i64 addrspace(3)*
-; CHECK-NEXT: %ptr2 = addrspacecast i64 addrspace(3)* %4 to i64*
-; CHECK-NEXT: store i64 2, i64* %ptr2, align 8
+; CHECK-NEXT: %ptr2 = addrspacecast i64 addrspace(3)* bitcast (i8 addrspace(3)* getelementptr inbounds ([32 x i8], [32 x i8] addrspace(3)* @lds.3, i32 0, i32 24) to i64 addrspace(3)*) to i64*
+; CHECK-NEXT: store i64 2, i64* %ptr2, align 1
 ; CHECK-NEXT: ret void
 ;
   %ptr1 = addrspacecast i64 addrspace(3)* bitcast (i8 addrspace(3)* getelementptr inbounds ([32 x i8], [32 x i8] addrspace(3)* @lds.3, i32 0, i32 16) to i64 addrspace(3)*) to i64*
@@ -91,9 +83,7 @@
 ; @lds.1 is used from constant expressions in different kernels.
 define amdgpu_kernel void @k4(i64 %x) {
 ; CHECK-LABEL: @k4(
-; CHECK-NEXT: %1 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k4.lds.t, %llvm.amdgcn.kernel.k4.lds.t addrspace(3)* @llvm.amdgcn.kernel.k4.lds, i32 0, i32 0), i32 0, i32 0
-; CHECK-NEXT: %2 = addrspacecast i8 addrspace(3)* %1 to i8*
-; CHECK-NEXT: %ptr = getelementptr inbounds i8, i8* %2, i64 %x
+; CHECK-NEXT: %ptr = getelementptr inbounds i8, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(3)* @lds.1, i32 0, i32 0) to i8*), i64 %x
 ; CHECK-NEXT: store i8 1, i8* %ptr, align 1
 ; CHECK-NEXT: ret void
 ;
@@ -107,9 +97,8 @@
 ; Multiple constexpr use in a same instruction.
 define amdgpu_kernel void @k5() {
 ; CHECK-LABEL: @k5(
-; CHECK-NEXT: %1 = addrspacecast [505 x i32] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k5.lds.t, %llvm.amdgcn.kernel.k5.lds.t addrspace(3)* @llvm.amdgcn.kernel.k5.lds, i32 0, i32 0) to [505 x i32]*
-; CHECK-NEXT: %2 = getelementptr inbounds [505 x i32], [505 x i32]* %1, i64 0, i64 0
-; CHECK-NEXT: call void undef(i32* %2, i32* %2)
+; CHECK-NEXT: call void undef(i32* getelementptr inbounds ([505 x i32], [505 x i32]* addrspacecast ([505 x i32] addrspace(3)* @lds.4 to [505 x i32]*), i64 0, i64 0), i32* getelementptr inbounds ([505 x i32], [505 x i32]* addrspacecast ([505 x i32] addrspace(3)* @lds.4 to [505 x i32]*), i64 0, i64 0))
+; CHECK-NEXT: ret void
 ;
   call void undef(i32* getelementptr inbounds ([505 x i32], [505 x i32]* addrspacecast ([505 x i32] addrspace(3)* @lds.4 to [505 x i32]*), i64 0, i64 0), i32* getelementptr inbounds ([505 x i32], [505 x i32]* addrspacecast ([505 x i32] addrspace(3)* @lds.4 to [505 x i32]*), i64 0, i64 0))
   ret void
@@ -122,10 +111,8 @@
 ; expression operands of store should be replaced by corresponding instruction sequence.
 define amdgpu_kernel void @k6() {
 ; CHECK-LABEL: @k6(
-; CHECK-NEXT: %1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k6.lds.t, %llvm.amdgcn.kernel.k6.lds.t addrspace(3)* @llvm.amdgcn.kernel.k6.lds, i32 0, i32 0), i32 0, i32 2
-; CHECK-NEXT: %2 = ptrtoint i32 addrspace(3)* %1 to i32
-; CHECK-NEXT: store i32 %2, i32 addrspace(3)* %1, align 8
-; CHECK-NEXT: ret void
+; CHECK-NEXT: store i32 ptrtoint (i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @lds.5, i32 0, i32 2) to i32), i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @lds.5, i32 0, i32 2), align 4
+; CHECK-NEXT: ret void
 ;
   store i32 ptrtoint (i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @lds.5, i32 0, i32 2) to i32), i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @lds.5, i32 0, i32 2)
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-super-align.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-super-align.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-super-align.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-super-align.ll
@@ -28,9 +28,7 @@
 ; SUPER-ALIGN_OFF: @llvm.amdgcn.kernel.k4.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k4.lds.t undef, align 4

 ; CHECK-LABEL: @k1
-; CHECK: %1 = getelementptr inbounds [32 x i8], [32 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 0), i32 0, i32 0
-; CHECK: %2 = addrspacecast i8 addrspace(3)* %1 to i8*
-; CHECK: %ptr = getelementptr inbounds i8, i8* %2, i64 %x
+; CHECK: %ptr = getelementptr inbounds i8, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([32 x i8], [32 x i8] addrspace(3)* @lds.1, i32 0, i32 0) to i8*), i64 %x
 ; CHECK: store i8 1, i8* %ptr, align 1
 define amdgpu_kernel void @k1(i64 %x) {
   %ptr = getelementptr inbounds i8, i8* addrspacecast ([32 x i8] addrspace(3)* @lds.1 to i8*), i64 %x
@@ -58,7 +56,14 @@
 ; Check that alignment is propagated to uses for arrays.

 ; CHECK-LABEL: @k3
-; CHECK: store i32 1, i32 addrspace(3)* %ptr1, align 8
+; CHECK: %ptr0 = getelementptr inbounds i64, i64 addrspace(3)* getelementptr inbounds ([32 x i64], [32 x i64] addrspace(3)* @lds.4, i32 0, i32 0), i64 0
+; CHECK: store i64 0, i64 addrspace(3)* %ptr0, align 8
+; CHECK: %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(3)* @lds.5, i32 0, i32 0), i64 2
+; CHECK: %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(3)* @lds.5, i32 0, i32 0), i64 3
+; CHECK: %ptr3 = getelementptr inbounds i32, i32 addrspace(3)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(3)* @lds.5, i32 0, i32 0), i64 4
+; CHECK: %ptr4 = getelementptr inbounds i32, i32 addrspace(3)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(3)* @lds.5, i32 0, i32 0), i64 5
+; CHECK: %ptr5 = getelementptr inbounds i32, i32 addrspace(3)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(3)* @lds.5, i32 0, i32 0), i64 %x
+; CHECK: store i32 1, i32 addrspace(3)* %ptr1, align 4
 ; CHECK: store i32 2, i32 addrspace(3)* %ptr2, align 4
 ; SUPER-ALIGN_ON: store i32 3, i32 addrspace(3)* %ptr3, align 16
 ; SUPER-ALIGN_OFF: store i32 3, i32 addrspace(3)* %ptr3, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll
@@ -5,11 +5,10 @@
 @b = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4

 ; CHECK-LABEL: @no_clobber_ds_load_stores_x2_preexisting_aa
-; CHECK: store i32 1, i32 addrspace(3)* %0, align 16, !tbaa !0, !noalias !5
-; CHECK: %val.a = load i32, i32 addrspace(3)* %gep.a, align 4, !tbaa !0, !noalias !5
-; CHECK: store i32 2, i32 addrspace(3)* %1, align 16, !tbaa !0, !noalias !5
-; CHECK: %val.b = load i32, i32 addrspace(3)* %gep.b, align 4, !tbaa !0, !noalias !5
-
+; CHECK: store i32 1, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @a, i32 0, i32 0), align 4, !tbaa !0, !alias.scope !5, !noalias !8
+; CHECK: %val.a = load i32, i32 addrspace(3)* %gep.a, align 4, !tbaa !0, !noalias !10
+; CHECK: store i32 2, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @b, i32 0, i32 0), align 4, !tbaa !0, !alias.scope !8, !noalias !5
+; CHECK: %val.b = load i32, i32 addrspace(3)* %gep.b, align 4, !tbaa !0, !noalias !10
 define amdgpu_kernel void @no_clobber_ds_load_stores_x2_preexisting_aa(i32 addrspace(1)* %arg, i32 %i) {
 bb:
   store i32 1, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @a, i32 0, i32 0), align 4, !alias.scope !0, !noalias !3, !tbaa !5
@@ -39,4 +38,9 @@
 ; CHECK:!2 = !{!"int", !3, i64 0}
 ; CHECK:!3 = !{!"omnipotent char", !4, i64 0}
 ; CHECK:!4 = !{!"Simple C++ TBAA"}
-; CHECK:!5 = !{}
+; CHECK:!5 = !{!6}
+; CHECK:!6 = distinct !{!6, !7}
+; CHECK:!7 = distinct !{!7}
+; CHECK:!8 = !{!9}
+; CHECK:!9 = distinct !{!9, !7}
+; CHECK:!10 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll
@@ -11,17 +11,14 @@
 ; are not adjacent. They are only moved later by MachineScheduler.

 ; GCN-LABEL: {{^}}no_clobber_ds_load_stores_x2:
-; GCN: ds_write_b32
-; GCN: ds_write_b32
-; GCN: ds_read_b32
-; GCN: ds_read_b32
+; GCN: ds_read2st64_b32
+; GCN: ds_write2st64

 ; CHECK-LABEL: @no_clobber_ds_load_stores_x2
-; CHECK: store i32 1, i32 addrspace(3)* %0, align 16, !alias.scope !0, !noalias !3
+; CHECK: store i32 1, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @a, i32 0, i32 0), align 4
 ; CHECK: %val.a = load i32, i32 addrspace(3)* %gep.a, align 4, !alias.scope !0, !noalias !3
-; CHECK: store i32 2, i32 addrspace(3)* %1, align 16, !alias.scope !3, !noalias !0
+; CHECK: store i32 2, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @b, i32 0, i32 0), align 4
 ; CHECK: %val.b = load i32, i32 addrspace(3)* %gep.b, align 4, !alias.scope !3, !noalias !0
-
 define amdgpu_kernel void @no_clobber_ds_load_stores_x2(i32 addrspace(1)* %arg, i32 %i) {
 bb:
   store i32 1, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @a, i32 0, i32 0), align 4
@@ -37,20 +34,17 @@

 ; GCN-LABEL: {{^}}no_clobber_ds_load_stores_x3:
 ; GCN-DAG: ds_write_b32
-; GCN-DAG: ds_write_b32
-; GCN-DAG: ds_write_b32
-; GCN-DAG: ds_read_b32
-; GCN-DAG: ds_read_b32
+; GCN-DAG: ds_read2st64_b32
 ; GCN-DAG: ds_read_b32
+; GCN-DAG: ds_write2st64_b32

 ; CHECK-LABEL: @no_clobber_ds_load_stores_x3
-; CHECK: store i32 1, i32 addrspace(3)* %0, align 16, !alias.scope !5, !noalias !8
-; CHECK: %val.a = load i32, i32 addrspace(3)* %gep.a, align 4, !alias.scope !5, !noalias !8
-; CHECK: store i32 2, i32 addrspace(3)* %1, align 16, !alias.scope !11, !noalias !12
-; CHECK: %val.b = load i32, i32 addrspace(3)* %gep.b, align 4, !alias.scope !11, !noalias !12
-; CHECK: store i32 3, i32 addrspace(3)* %2, align 16, !alias.scope !13, !noalias !14
-; CHECK: %val.c = load i32, i32 addrspace(3)* %gep.c, align 4, !alias.scope !13, !noalias !14
-
+; CHECK: store i32 1, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @a, i32 0, i32 0), align 4
+; CHECK: %val.a = load i32, i32 addrspace(3)* %gep.a, align 4, !alias.scope !5, !noalias !8
+; CHECK: store i32 2, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @b, i32 0, i32 0), align 4
+; CHECK: %val.b = load i32, i32 addrspace(3)* %gep.b, align 4, !alias.scope !11, !noalias !12
+; CHECK: store i32 3, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @c, i32 0, i32 0), align 4
+; CHECK: %val.c = load i32, i32 addrspace(3)* %gep.c, align 4, !alias.scope !13, !noalias !14
 define amdgpu_kernel void @no_clobber_ds_load_stores_x3(i32 addrspace(1)* %arg, i32 %i) {
 bb:
   store i32 1, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @a, i32 0, i32 0), align 4
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
@@ -35,21 +35,11 @@
   ret void
 }

-; CHECK-LABEL: @timestwo() #0
-; CHECK-NOT: call void @llvm.donothing()
-; CHECK: %1 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.timestwo.lds.t, %llvm.amdgcn.kernel.timestwo.lds.t addrspace(3)* @llvm.amdgcn.kernel.timestwo.lds, i32 0, i32 0) to i32 addrspace(3)*
-; CHECK: %2 = addrspacecast i32 addrspace(3)* %1 to i32*
-; CHECK: %3 = ptrtoint i32* %2 to i64
-; CHECK: %4 = add i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64), %3
-; CHECK: %5 = inttoptr i64 %4 to i32*
-; CHECK: %ld = load i32, i32* %5, align 4
-; CHECK: %mul = mul i32 %ld, 2
-; CHECK: %6 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.timestwo.lds.t, %llvm.amdgcn.kernel.timestwo.lds.t addrspace(3)* @llvm.amdgcn.kernel.timestwo.lds, i32 0, i32 0) to i32 addrspace(3)*
-; CHECK: %7 = addrspacecast i32 addrspace(3)* %6 to i32*
-; CHECK: %8 = ptrtoint i32* %7 to i64
-; CHECK: %9 = add i64 %8, ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64)
-; CHECK: %10 = inttoptr i64 %9 to i32*
-; CHECK: store i32 %mul, i32* %10, align 4
+; CHECK-LABEL: define amdgpu_kernel void @timestwo() #0 {
+; CHECK-NOT: call void @llvm.donothing()
+; CHECK: %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
+; CHECK: %mul = mul i32 %ld, 2
+; CHECK: store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
 define amdgpu_kernel void @timestwo() {
   %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
   %mul = mul i32 %ld, 2