Index: llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -319,6 +319,10 @@
         UsedList.erase(GV);
         GV->eraseFromParent();
       }
+
+      uint64_t Off = DL.getStructLayout(LDSTy)->getElementOffset(I);
+      Align A = commonAlignment(MaxAlign, Off);
+      refineUsesAlignment(GEP, A, DL);
     }
 
     // Mark kernels with asm that reads the address of the allocated structure
@@ -338,6 +342,46 @@
     }
     return true;
   }
+
+  void refineUsesAlignment(Value *Ptr, Align A, const DataLayout &DL,
+                           unsigned MaxDepth = 5) {
+    if (!MaxDepth)
+      return;
+
+    for (User *U : Ptr->users()) {
+      if (auto *LI = dyn_cast<LoadInst>(U)) {
+        LI->setAlignment(std::max(A, LI->getAlign()));
+        continue;
+      }
+      if (auto *SI = dyn_cast<StoreInst>(U)) {
+        SI->setAlignment(std::max(A, SI->getAlign()));
+        continue;
+      }
+      if (auto *AI = dyn_cast<AtomicRMWInst>(U)) {
+        AI->setAlignment(std::max(A, AI->getAlign()));
+        continue;
+      }
+      if (auto *AI = dyn_cast<AtomicCmpXchgInst>(U)) {
+        AI->setAlignment(std::max(A, AI->getAlign()));
+        continue;
+      }
+      if (auto *GEP = dyn_cast<GetElementPtrInst>(U)) {
+        unsigned BitWidth = DL.getIndexTypeSizeInBits(GEP->getType());
+        APInt Off(BitWidth, 0);
+        if (GEP->getPointerOperand() == Ptr &&
+            GEP->accumulateConstantOffset(DL, Off)) {
+          Align GA = commonAlignment(A, Off.getLimitedValue());
+          refineUsesAlignment(GEP, GA, DL, MaxDepth - 1);
+        }
+        continue;
+      }
+      if (auto *I = dyn_cast<Instruction>(U)) {
+        if (I->getOpcode() == Instruction::BitCast ||
+            I->getOpcode() == Instruction::AddrSpaceCast)
+          refineUsesAlignment(I, A, DL, MaxDepth - 1);
+      }
+    }
+  }
 };
 
 } // namespace
Index: llvm/test/CodeGen/AMDGPU/ds_read2.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -1009,7 +1009,7 @@
 ; CI: ; %bb.0:
 ; CI-NEXT: v_mov_b32_e32 v0, 0
 ; CI-NEXT: s_mov_b32 m0, -1
-; CI-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
+; CI-NEXT: ds_read_b128 v[0:3], v0
 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; CI-NEXT: s_mov_b32 s3, 0xf000
 ; CI-NEXT: s_mov_b32 s2, -1
@@ -1019,27 +1019,16 @@
 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; CI-NEXT: s_endpgm
 ;
-; GFX9-ALIGNED-LABEL: load_misaligned64_constant_offsets:
-; GFX9-ALIGNED: ; %bb.0:
-; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-ALIGNED-NEXT: ds_read2_b64 v[0:3], v4 offset1:1
-; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-ALIGNED-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-ALIGNED-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-ALIGNED-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
-; GFX9-ALIGNED-NEXT: s_endpgm
-;
-; GFX9-UNALIGNED-LABEL: load_misaligned64_constant_offsets:
-; GFX9-UNALIGNED: ; %bb.0:
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-UNALIGNED-NEXT: ds_read_b128 v[0:3], v4
-; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-UNALIGNED-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-UNALIGNED-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-UNALIGNED-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
-; GFX9-UNALIGNED-NEXT: s_endpgm
+; GFX9-LABEL: load_misaligned64_constant_offsets:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: ds_read_b128 v[0:3], v4
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
   %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
   %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
   %sum = add i64 %val0, %val1
Index: llvm/test/CodeGen/AMDGPU/ds_write2.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/ds_write2.ll
+++ llvm/test/CodeGen/AMDGPU/ds_write2.ll
@@ -818,33 +818,22 @@
 define amdgpu_kernel void @store_misaligned64_constant_offsets() {
 ; CI-LABEL: store_misaligned64_constant_offsets:
 ; CI: ; %bb.0:
-; CI-NEXT: s_movk_i32 s0, 0x7b
-; CI-NEXT: s_mov_b32 s1, 0
-; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: v_mov_b32_e32 v2, 0
-; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: v_mov_b32_e32 v0, 0x7b
+; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: v_mov_b32_e32 v2, v0
+; CI-NEXT: v_mov_b32_e32 v3, v1
 ; CI-NEXT: s_mov_b32 m0, -1
-; CI-NEXT: ds_write2_b64 v2, v[0:1], v[0:1] offset1:1
+; CI-NEXT: ds_write_b128 v1, v[0:3]
 ; CI-NEXT: s_endpgm
 ;
-; GFX9-ALIGNED-LABEL: store_misaligned64_constant_offsets:
-; GFX9-ALIGNED: ; %bb.0:
-; GFX9-ALIGNED-NEXT: s_movk_i32 s0, 0x7b
-; GFX9-ALIGNED-NEXT: s_mov_b32 s1, 0
-; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-ALIGNED-NEXT: ds_write2_b64 v2, v[0:1], v[0:1] offset1:1
-; GFX9-ALIGNED-NEXT: s_endpgm
-;
-; GFX9-UNALIGNED-LABEL: store_misaligned64_constant_offsets:
-; GFX9-UNALIGNED: ; %bb.0:
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x7b
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-UNALIGNED-NEXT: ds_write_b128 v1, v[0:3]
-; GFX9-UNALIGNED-NEXT: s_endpgm
+; GFX9-LABEL: store_misaligned64_constant_offsets:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: ds_write_b128 v1, v[0:3]
+; GFX9-NEXT: s_endpgm
   store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
   store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
   ret void
Index: llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll
+++ llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll
@@ -25,11 +25,11 @@
 ; CHECK-LABEL: @k0(
 ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
 ; CHECK-NEXT: %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i8 addrspace(3)*
-; CHECK-NEXT: store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1
+; CHECK-NEXT: store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 8
 ; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 2) to i8 addrspace(3)*
-; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2
+; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 4
 ; CHECK-NEXT: %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 1) to i8 addrspace(3)*
-; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4
+; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 16
 ; CHECK-NEXT: %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0) to i8 addrspace(3)*
 ; CHECK-NEXT: store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16
 ; CHECK-NEXT: ret void
@@ -53,9 +53,9 @@
 ; CHECK-LABEL: @k1(
 ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
 ; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 2) to i8 addrspace(3)*
-; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2
+; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 4
 ; CHECK-NEXT: %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 1) to i8 addrspace(3)*
-; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4
+; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 16
 ; CHECK-NEXT: %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 0) to i8 addrspace(3)*
 ; CHECK-NEXT: store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16
 ; CHECK-NEXT: ret void
@@ -101,9 +101,9 @@
 define void @f0() {
 ; CHECK-LABEL: @f0(
 ; CHECK-NEXT: %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i8 addrspace(3)*
-; CHECK-NEXT: store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1
+; CHECK-NEXT: store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 8
 ; CHECK-NEXT: %lds.size.8.align.8.bc = bitcast [8 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0) to i8 addrspace(3)*
-; CHECK-NEXT: store i8 8, i8 addrspace(3)* %lds.size.8.align.8.bc, align 4
+; CHECK-NEXT: store i8 8, i8 addrspace(3)* %lds.size.8.align.8.bc, align 8
 ; CHECK-NEXT: ret void
 ;
   %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @lds.size.1.align.1 to i8 addrspace(3)*
Index: llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll
+++ llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll
@@ -76,7 +76,7 @@
 ; CHECK-NEXT: %3 = getelementptr inbounds [32 x i8], [32 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k3.lds.t, %llvm.amdgcn.kernel.k3.lds.t addrspace(3)* @llvm.amdgcn.kernel.k3.lds, i32 0, i32 0), i32 0, i32 24
 ; CHECK-NEXT: %4 = bitcast i8 addrspace(3)* %3 to i64 addrspace(3)*
 ; CHECK-NEXT: %ptr2 = addrspacecast i64 addrspace(3)* %4 to i64*
-; CHECK-NEXT: store i64 2, i64* %ptr2, align 1
+; CHECK-NEXT: store i64 2, i64* %ptr2, align 8
 ; CHECK-NEXT: ret void
 ;
   %ptr1 = addrspacecast i64 addrspace(3)* bitcast (i8 addrspace(3)* getelementptr inbounds ([32 x i8], [32 x i8] addrspace(3)* @lds.3, i32 0, i32 16) to i64 addrspace(3)*) to i64*
Index: llvm/test/CodeGen/AMDGPU/lower-kernel-lds-global-uses.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/lower-kernel-lds-global-uses.ll
+++ llvm/test/CodeGen/AMDGPU/lower-kernel-lds-global-uses.ll
@@ -41,7 +41,7 @@
 @llvm.compiler.used = appending global [3 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* @lds.2 to i8 addrspace(3)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.4 to i8 addrspace(1)*) to i8*), i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* @lds.6 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
 
 ; CHECK-LABEL: @k0()
-; CHECK: %ld.lds.1 = load i16, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 1), align 2
+; CHECK: %ld.lds.1 = load i16, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 1), align 4
 ; CHECK: %ld.lds.2 = load i32, i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0), align 4
 ; CHECK: %ld.lds.3 = load i64, i64 addrspace(3)* @lds.3, align 4
 ; CHECK: %ld.lds.4 = load float, float addrspace(3)* @lds.4, align 4
Index: llvm/test/CodeGen/AMDGPU/lower-kernel-lds-super-align.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/lower-kernel-lds-super-align.ll
+++ llvm/test/CodeGen/AMDGPU/lower-kernel-lds-super-align.ll
@@ -4,6 +4,8 @@
 ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds --amdgpu-super-align-lds-globals=false < %s | FileCheck --check-prefixes=CHECK,SUPER-ALIGN_OFF %s
 
 ; CHECK: %llvm.amdgcn.kernel.k1.lds.t = type { [32 x i8] }
+; CHECK: %llvm.amdgcn.kernel.k2.lds.t = type { i16, [2 x i8], i16 }
+; CHECK: %llvm.amdgcn.kernel.k3.lds.t = type { [32 x i64], [32 x i32] }
 
 ; CHECK-NOT: @lds.1
 @lds.1 = internal unnamed_addr addrspace(3) global [32 x i8] undef, align 1
@@ -11,6 +13,10 @@
 ; SUPER-ALIGN_ON: @llvm.amdgcn.kernel.k1.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k1.lds.t undef, align 16
 ; SUPER-ALIGN_OFF: @llvm.amdgcn.kernel.k1.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k1.lds.t undef, align 1
 
+; CHECK: @llvm.amdgcn.kernel.k2.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k2.lds.t undef, align 4
+; SUPER-ALIGN_ON: @llvm.amdgcn.kernel.k3.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k3.lds.t undef, align 16
+; SUPER-ALIGN_OFF: @llvm.amdgcn.kernel.k3.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k3.lds.t undef, align 8
+
 ; CHECK-LABEL: @k1
 ; CHECK: %1 = getelementptr inbounds [32 x i8], [32 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 0), i32 0, i32 0
 ; CHECK: %2 = addrspacecast i8 addrspace(3)* %1 to i8*
@@ -21,3 +27,103 @@
   store i8 1, i8 addrspace(0)* %ptr, align 1
   ret void
 }
+
+@lds.2 = internal unnamed_addr addrspace(3) global i16 undef, align 4
+@lds.3 = internal unnamed_addr addrspace(3) global i16 undef, align 4
+
+; Check that alignment is propagated to uses for scalar variables.
+
+; CHECK-LABEL: @k2
+; CHECK: store i16 1, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k2.lds.t, %llvm.amdgcn.kernel.k2.lds.t addrspace(3)* @llvm.amdgcn.kernel.k2.lds, i32 0, i32 0), align 4
+; CHECK: store i16 2, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k2.lds.t, %llvm.amdgcn.kernel.k2.lds.t addrspace(3)* @llvm.amdgcn.kernel.k2.lds, i32 0, i32 2), align 4
+define amdgpu_kernel void @k2() {
+  store i16 1, i16 addrspace(3)* @lds.2, align 2
+  store i16 2, i16 addrspace(3)* @lds.3, align 2
+  ret void
+}
+
+@lds.4 = internal unnamed_addr addrspace(3) global [32 x i64] undef, align 8
+@lds.5 = internal unnamed_addr addrspace(3) global [32 x i32] undef, align 4
+
+; Check that alignment is propagated to uses for arrays.
+
+; CHECK-LABEL: @k3
+; CHECK: store i32 1, i32 addrspace(3)* %ptr1, align 8
+; CHECK: store i32 2, i32 addrspace(3)* %ptr2, align 4
+; SUPER-ALIGN_ON: store i32 3, i32 addrspace(3)* %ptr3, align 16
+; SUPER-ALIGN_OFF: store i32 3, i32 addrspace(3)* %ptr3, align 8
+; CHECK: store i32 4, i32 addrspace(3)* %ptr4, align 4
+; CHECK: store i32 5, i32 addrspace(3)* %ptr5, align 4
+; CHECK: %load1 = load i32, i32 addrspace(3)* %ptr1, align 8
+; CHECK: %load2 = load i32, i32 addrspace(3)* %ptr2, align 4
+; SUPER-ALIGN_ON: %load3 = load i32, i32 addrspace(3)* %ptr3, align 16
+; SUPER-ALIGN_OFF: %load3 = load i32, i32 addrspace(3)* %ptr3, align 8
+; CHECK: %load4 = load i32, i32 addrspace(3)* %ptr4, align 4
+; CHECK: %load5 = load i32, i32 addrspace(3)* %ptr5, align 4
+; CHECK: %val1 = atomicrmw volatile add i32 addrspace(3)* %ptr1, i32 1 monotonic, align 8
+; CHECK: %val2 = cmpxchg volatile i32 addrspace(3)* %ptr1, i32 1, i32 2 monotonic monotonic, align 8
+; CHECK: %ptr1.bc = bitcast i32 addrspace(3)* %ptr1 to i16 addrspace(3)*
+; CHECK: %ptr2.bc = bitcast i32 addrspace(3)* %ptr2 to i16 addrspace(3)*
+; CHECK: %ptr3.bc = bitcast i32 addrspace(3)* %ptr3 to i16 addrspace(3)*
+; CHECK: %ptr4.bc = bitcast i32 addrspace(3)* %ptr4 to i16 addrspace(3)*
+; CHECK: store i16 11, i16 addrspace(3)* %ptr1.bc, align 8
+; CHECK: store i16 12, i16 addrspace(3)* %ptr2.bc, align 4
+; SUPER-ALIGN_ON: store i16 13, i16 addrspace(3)* %ptr3.bc, align 16
+; SUPER-ALIGN_OFF: store i16 13, i16 addrspace(3)* %ptr3.bc, align 8
+; CHECK: store i16 14, i16 addrspace(3)* %ptr4.bc, align 4
+; CHECK: %ptr1.ac = addrspacecast i32 addrspace(3)* %ptr1 to i32*
+; CHECK: %ptr2.ac = addrspacecast i32 addrspace(3)* %ptr2 to i32*
+; CHECK: %ptr3.ac = addrspacecast i32 addrspace(3)* %ptr3 to i32*
+; CHECK: %ptr4.ac = addrspacecast i32 addrspace(3)* %ptr4 to i32*
+; CHECK: store i32 21, i32* %ptr1.ac, align 8
+; CHECK: store i32 22, i32* %ptr2.ac, align 4
+; SUPER-ALIGN_ON: store i32 23, i32* %ptr3.ac, align 16
+; SUPER-ALIGN_OFF: store i32 23, i32* %ptr3.ac, align 8
+; CHECK: store i32 24, i32* %ptr4.ac, align 4
+define amdgpu_kernel void @k3(i64 %x) {
+  %ptr0 = getelementptr inbounds i64, i64 addrspace(3)* bitcast ([32 x i64] addrspace(3)* @lds.4 to i64 addrspace(3)*), i64 0
+  store i64 0, i64 addrspace(3)* %ptr0, align 8
+
+  %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* bitcast ([32 x i32] addrspace(3)* @lds.5 to i32 addrspace(3)*), i64 2
+  %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* bitcast ([32 x i32] addrspace(3)* @lds.5 to i32 addrspace(3)*), i64 3
+  %ptr3 = getelementptr inbounds i32, i32 addrspace(3)* bitcast ([32 x i32] addrspace(3)* @lds.5 to i32 addrspace(3)*), i64 4
+  %ptr4 = getelementptr inbounds i32, i32 addrspace(3)* bitcast ([32 x i32] addrspace(3)* @lds.5 to i32 addrspace(3)*), i64 5
+  %ptr5 = getelementptr inbounds i32, i32 addrspace(3)* bitcast ([32 x i32] addrspace(3)* @lds.5 to i32 addrspace(3)*), i64 %x
+
+  store i32 1, i32 addrspace(3)* %ptr1, align 4
+  store i32 2, i32 addrspace(3)* %ptr2, align 4
+  store i32 3, i32 addrspace(3)* %ptr3, align 4
+  store i32 4, i32 addrspace(3)* %ptr4, align 4
+  store i32 5, i32 addrspace(3)* %ptr5, align 4
+
+  %load1 = load i32, i32 addrspace(3)* %ptr1, align 4
+  %load2 = load i32, i32 addrspace(3)* %ptr2, align 4
+  %load3 = load i32, i32 addrspace(3)* %ptr3, align 4
+  %load4 = load i32, i32 addrspace(3)* %ptr4, align 4
+  %load5 = load i32, i32 addrspace(3)* %ptr5, align 4
+
+  %val1 = atomicrmw volatile add i32 addrspace(3)* %ptr1, i32 1 monotonic, align 4
+  %val2 = cmpxchg volatile i32 addrspace(3)* %ptr1, i32 1, i32 2 monotonic monotonic, align 4
+
+  %ptr1.bc = bitcast i32 addrspace(3)* %ptr1 to i16 addrspace(3)*
+  %ptr2.bc = bitcast i32 addrspace(3)* %ptr2 to i16 addrspace(3)*
+  %ptr3.bc = bitcast i32 addrspace(3)* %ptr3 to i16 addrspace(3)*
+  %ptr4.bc = bitcast i32 addrspace(3)* %ptr4 to i16 addrspace(3)*
+
+  store i16 11, i16 addrspace(3)* %ptr1.bc, align 2
+  store i16 12, i16 addrspace(3)* %ptr2.bc, align 2
+  store i16 13, i16 addrspace(3)* %ptr3.bc, align 2
+  store i16 14, i16 addrspace(3)* %ptr4.bc, align 2
+
+  %ptr1.ac = addrspacecast i32 addrspace(3)* %ptr1 to i32*
+  %ptr2.ac = addrspacecast i32 addrspace(3)* %ptr2 to i32*
+  %ptr3.ac = addrspacecast i32 addrspace(3)* %ptr3 to i32*
+  %ptr4.ac = addrspacecast i32 addrspace(3)* %ptr4 to i32*
+
+  store i32 21, i32* %ptr1.ac, align 4
+  store i32 22, i32* %ptr2.ac, align 4
+  store i32 23, i32* %ptr3.ac, align 4
+  store i32 24, i32* %ptr4.ac, align 4
+
+  ret void
+}
Index: llvm/test/CodeGen/AMDGPU/lower-kernel-lds.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/lower-kernel-lds.ll
+++ llvm/test/CodeGen/AMDGPU/lower-kernel-lds.ll
@@ -18,11 +18,11 @@
 define amdgpu_kernel void @k0() {
 ; CHECK-LABEL: @k0(
 ; CHECK-NEXT: %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 3) to i8 addrspace(3)*
-; CHECK-NEXT: store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1
+; CHECK-NEXT: store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 2
 ; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 2) to i8 addrspace(3)*
-; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2
+; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 4
 ; CHECK-NEXT: %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 1) to i8 addrspace(3)*
-; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4
+; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 16
 ; CHECK-NEXT: %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0) to i8 addrspace(3)*
 ; CHECK-NEXT: store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16
 ; CHECK-NEXT: ret void
@@ -45,9 +45,9 @@
 define amdgpu_kernel void @k1() {
 ; CHECK-LABEL: @k1(
 ; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 2) to i8 addrspace(3)*
-; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2
+; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 4
 ; CHECK-NEXT: %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 1) to i8 addrspace(3)*
-; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4
+; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 16
 ; CHECK-NEXT: %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 0) to i8 addrspace(3)*
 ; CHECK-NEXT: store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16
 ; CHECK-NEXT: ret void
Index: llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll
+++ llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll
@@ -29,7 +29,7 @@
 @llvm.compiler.used = appending global [2 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (float addrspace(3)* @tolower to i8 addrspace(3)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64 addrspace(1)* @ignored to i8 addrspace(1)*) to i8*)], section "llvm.metadata"
 
 ; CHECK-LABEL: @func()
-; CHECK: %dec = atomicrmw fsub float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), float 1.000000e+00 monotonic, align 4
+; CHECK: %dec = atomicrmw fsub float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), float 1.000000e+00 monotonic, align 8
 define void @func() {
   %dec = atomicrmw fsub float addrspace(3)* @tolower, float 1.0 monotonic
   %unused0 = atomicrmw add i64 addrspace(1)* @ignored, i64 1 monotonic
Index: llvm/test/CodeGen/AMDGPU/lower-module-lds.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/lower-module-lds.ll
+++ llvm/test/CodeGen/AMDGPU/lower-module-lds.ll
@@ -24,9 +24,9 @@
 ; Use in func rewritten to access struct at address zero
 ; CHECK-LABEL: @func()
 ; CHECK: %dec = atomicrmw fsub float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), float 1.0
-; CHECK: %val0 = load i32, i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2), align 4
+; CHECK: %val0 = load i32, i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2), align 8
 ; CHECK: %val1 = add i32 %val0, 4
-; CHECK: store i32 %val1, i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2), align 4
+; CHECK: store i32 %val1, i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2), align 8
 ; CHECK: %unused0 = atomicrmw add i64 addrspace(3)* @with_init, i64 1 monotonic
 define void @func() {
   %dec = atomicrmw fsub float addrspace(3)* @var0, float 1.0 monotonic
@@ -41,7 +41,7 @@
 ; CHECK-LABEL: @kern_call()
 ; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
 ; CHECK: call void @func()
-; CHECK: %dec = atomicrmw fsub float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), float 2.000000e+00 monotonic, align 4
+; CHECK: %dec = atomicrmw fsub float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), float 2.000000e+00 monotonic, align 8
 define amdgpu_kernel void @kern_call() {
   call void @func()
   %dec = atomicrmw fsub float addrspace(3)* @var0, float 2.0 monotonic
Index: llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-lds-offsets.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-lds-offsets.ll
+++ llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-lds-offsets.ll
@@ -73,7 +73,7 @@
 ; LOWER_LDS-LABEL: @f1
-; LOWER_LDS: %1 = load i16, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1), align 2
+; LOWER_LDS: %1 = load i16, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1), align 16
 ; LOWER_LDS: %2 = getelementptr i8, i8 addrspace(3)* null, i16 %1
 ; LOWER_LDS: %3 = bitcast i8 addrspace(3)* %2 to i32 addrspace(3)*
 ; LOWER_LDS: store i32 7, i32 addrspace(3)* %3, align 4
@@ -153,7 +153,7 @@
 ; LOWER_LDS: %4 = ptrtoint i64 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 0) to i16
 ; LOWER_LDS: store i16 %4, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2), align 2
 ; LOWER_LDS: %5 = ptrtoint i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 1) to i16
-; LOWER_LDS: store i16 %5, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1), align 2
+; LOWER_LDS: store i16 %5, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1), align 16
 ; LOWER_LDS: br label %6
 ;
 ; LOWER_LDS-LABEL: 6:
@@ -177,11 +177,9 @@
 ; GCN: s_mov_b32 s32, 0
 ; GCN: s_and_saveexec_b64 s[0:1], vcc
 ; GCN: s_cbranch_execz BB2_2
-; GCN: v_mov_b32_e32 v0, 24
-; GCN: v_mov_b32_e32 v1, 0
-; GCN: ds_write_b16 v1, v0 offset:18
-; GCN: v_mov_b32_e32 v0, 32
-; GCN: ds_write_b16 v1, v0 offset:16
+; GCN: v_mov_b32_e32 v0, 0
+; GCN: v_mov_b32_e32 v1, 0x180020
+; GCN: ds_write_b32 v0, v1 offset:16
 ; GCN-LABEL: BB2_2:
 ; GCN: s_or_b64 exec, exec, s[0:1]
 ; GCN: s_getpc_b64 s[0:1]
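Note (not part of the patch itself): a minimal, self-contained sketch of the alignment arithmetic that the new refineUsesAlignment helper relies on, using LLVM's llvm/Support/Alignment.h. The base alignment and offsets below are illustrative; the offset-24 case mirrors the lower-kernel-lds-constexpr.ll change above, where a store at byte offset 24 from a 16-byte-aligned LDS struct is refined from align 1 to align 8.

// Sketch: commonAlignment() derives the largest alignment provable for an
// access at a constant byte offset from a base with known alignment. The
// pass applies this per struct element and then recurses through GEPs and
// casts, taking the max with each use's existing alignment.
#include "llvm/Support/Alignment.h"
#include <cassert>

int main() {
  llvm::Align Base(16); // LDS struct allocated with 16-byte alignment.
  assert(llvm::commonAlignment(Base, 0).value() == 16);  // offset 0 keeps 16
  assert(llvm::commonAlignment(Base, 4).value() == 4);   // offset 4 -> align 4
  assert(llvm::commonAlignment(Base, 8).value() == 8);   // offset 8 -> align 8
  assert(llvm::commonAlignment(Base, 24).value() == 8);  // offset 24 -> align 8
  return 0;
}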