Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1516,25 +1516,23 @@
 
   if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
       AddrSpace == AMDGPUAS::REGION_ADDRESS) {
+    // Check if alignment requirements for ds_read/write instructions are
+    // disabled.
+    if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
+      return false;
+
     Align RequiredAlignment(PowerOf2Ceil(Size/8)); // Natural alignment.
     if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
         Alignment < RequiredAlignment)
       return false;
 
-    // Check if alignment requirements for ds_read/write instructions are
-    // disabled.
-    if (Subtarget->hasUnalignedDSAccessEnabled()) {
-      if (IsFast)
-        *IsFast = Alignment != Align(2);
-      return true;
-    }
-
     // Either, the alignment requirements are "enabled", or there is an
     // unaligned LDS access related hardware bug though alignment requirements
     // are "disabled". In either case, we need to check for proper alignment
     // requirements.
     //
-    if (Size == 64) {
+    switch (Size) {
+    case 64:
       // SI has a hardware bug in the LDS / GDS bounds checking: if the base
       // address is negative, then the instruction is incorrectly treated as
       // out-of-bounds even if base + offsets is in bounds. Split vectorized
@@ -1546,31 +1544,41 @@
       // 8 byte accessing via ds_read/write_b64 require 8-byte alignment, but we
       // can do a 4 byte aligned, 8 byte access in a single operation using
       // ds_read2/write2_b32 with adjacent offsets.
-      bool AlignedBy4 = Alignment >= Align(4);
-      if (IsFast)
-        *IsFast = AlignedBy4;
+      RequiredAlignment = Align(4);
+      break;
+    case 96:
+      if (!Subtarget->hasDS96AndDS128())
+        return false;
 
-      return AlignedBy4;
-    }
-    if (Size == 96) {
       // 12 byte accessing via ds_read/write_b96 require 16-byte alignment on
       // gfx8 and older.
-      bool AlignedBy16 = Alignment >= Align(16);
-      if (IsFast)
-        *IsFast = AlignedBy16;
+      RequiredAlignment = Align(16);
+      break;
+    case 128:
+      if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
+        return false;
 
-      return AlignedBy16;
-    }
-    if (Size == 128) {
       // 16 byte accessing via ds_read/write_b128 require 16-byte alignment on
       // gfx8 and older, but we can do a 8 byte aligned, 16 byte access in a
      // single operation using ds_read2/write2_b64.
-      bool AlignedBy8 = Alignment >= Align(8);
-      if (IsFast)
-        *IsFast = AlignedBy8;
+      RequiredAlignment = Align(8);
+      break;
+    default:
+      if (Size > 32)
+        return false;
 
-      return AlignedBy8;
+      break;
     }
+
+    if (IsFast) {
+      // FIXME: Lie it is fast if +unaligned-access-mode is passed so that
+      // DS accesses get vectorized.
+      *IsFast = Alignment >= RequiredAlignment ||
+                Subtarget->hasUnalignedDSAccessEnabled();
+    }
+
+    return Alignment >= RequiredAlignment ||
+           Subtarget->hasUnalignedDSAccessEnabled();
   }
 
   if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
@@ -1595,9 +1603,7 @@
     return AlignedBy4;
   }
 
-  if (Subtarget->hasUnalignedBufferAccessEnabled() &&
-      !(AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
-        AddrSpace == AMDGPUAS::REGION_ADDRESS)) {
+  if (Subtarget->hasUnalignedBufferAccessEnabled()) {
     // If we have a uniform constant load, it still requires using a slow
     // buffer instruction if unaligned.
     if (IsFast) {
@@ -8519,27 +8525,15 @@
       llvm_unreachable("unsupported private_element_size");
     }
   } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
-    // Use ds_read_b128 or ds_read_b96 when possible.
-    if (Subtarget->hasDS96AndDS128() &&
-        ((Subtarget->useDS128() && MemVT.getStoreSize() == 16) ||
-         MemVT.getStoreSize() == 12) &&
-        allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
-                                           Load->getAlign()))
+    bool Fast = false;
+    auto Flags = Load->getMemOperand()->getFlags();
+    if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
+                                           Load->getAlign(), Flags, &Fast) &&
+        Fast)
       return SDValue();
 
-    if (NumElements > 2)
+    if (MemVT.isVector())
       return SplitVectorLoad(Op, DAG);
-
-    // SI has a hardware bug in the LDS / GDS bounds checking: if the base
-    // address is negative, then the instruction is incorrectly treated as
-    // out-of-bounds even if base + offsets is in bounds. Split vectorized
-    // loads here to avoid emitting ds_read2_b32. We may re-combine the
-    // load later in the SILoadStoreOptimizer.
-    if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
-        NumElements == 2 && MemVT.getStoreSize() == 8 &&
-        Load->getAlignment() < 8) {
-      return SplitVectorLoad(Op, DAG);
-    }
   }
 
   if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
@@ -9030,36 +9024,17 @@
       llvm_unreachable("unsupported private_element_size");
     }
   } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
-    // Use ds_write_b128 or ds_write_b96 when possible.
-    if (Subtarget->hasDS96AndDS128() &&
-        ((Subtarget->useDS128() && VT.getStoreSize() == 16) ||
-         (VT.getStoreSize() == 12)) &&
-        allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
-                                           Store->getAlign()))
+    bool Fast = false;
+    auto Flags = Store->getMemOperand()->getFlags();
+    if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
+                                           Store->getAlign(), Flags, &Fast) &&
+        Fast)
       return SDValue();
 
-    if (NumElements > 2)
+    if (VT.isVector())
       return SplitVectorStore(Op, DAG);
 
-    // SI has a hardware bug in the LDS / GDS bounds checking: if the base
-    // address is negative, then the instruction is incorrectly treated as
-    // out-of-bounds even if base + offsets is in bounds. Split vectorized
-    // stores here to avoid emitting ds_write2_b32. We may re-combine the
-    // store later in the SILoadStoreOptimizer.
-    if (!Subtarget->hasUsableDSOffset() &&
-        NumElements == 2 && VT.getStoreSize() == 8 &&
-        Store->getAlignment() < 8) {
-      return SplitVectorStore(Op, DAG);
-    }
-
-    if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
-                                        VT, *Store->getMemOperand())) {
-      if (VT.isVector())
-        return SplitVectorStore(Op, DAG);
-      return expandUnalignedStore(Store, DAG);
-    }
-
-    return SDValue();
+    return expandUnalignedStore(Store, DAG);
   } else {
     llvm_unreachable("unhandled address space");
   }
Index: llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll
+++ llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll
@@ -66,41 +66,40 @@
 ; CHECK-NEXT:    s_mov_b32 s6, s4
 ; CHECK-NEXT:    s_mov_b32 s5, s3
 ; CHECK-NEXT:    s_mov_b32 s4, s2
-; CHECK-NEXT:    v_add_i32_e32 v0, vcc, 4, v1
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, 20, v1
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, 16, v1
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, 12, v1
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, 8, v1
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, 12, v1
-; CHECK-NEXT:    v_add_i32_e32 v7, vcc, 16, v1
-; CHECK-NEXT:    v_add_i32_e32 v8, vcc, 20, v1
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, 4, v1
 ; CHECK-NEXT:    v_mov_b32_e32 v9, s0
-; CHECK-NEXT:    v_add_i32_e32 v10, vcc, 4, v2
-; CHECK-NEXT:    v_add_i32_e32 v11, vcc, 8, v2
-; CHECK-NEXT:    v_add_i32_e32 v12, vcc, 12, v2
+; CHECK-NEXT:    v_add_i32_e32 v10, vcc, 20, v2
+; CHECK-NEXT:    v_add_i32_e32 v11, vcc, 16, v2
 ; CHECK-NEXT:    s_mov_b32 m0, -1
-; CHECK-NEXT:    ds_read_b32 v3, v1
-; CHECK-NEXT:    ds_read_b32 v4, v0
+; CHECK-NEXT:    ds_read_b32 v7, v3
+; CHECK-NEXT:    ds_read_b32 v6, v4
 ; CHECK-NEXT:    ds_read_b32 v5, v5
-; CHECK-NEXT:    ds_read_b32 v6, v6
-; CHECK-NEXT:    ds_read_b32 v0, v7
-; CHECK-NEXT:    ds_read_b32 v1, v8
-; CHECK-NEXT:    v_add_i32_e32 v7, vcc, 16, v2
-; CHECK-NEXT:    v_add_i32_e32 v8, vcc, 20, v2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
-; CHECK-NEXT:    tbuffer_store_format_xyzw v[3:6], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:264 glc slc
+; CHECK-NEXT:    ds_read_b32 v4, v8
+; CHECK-NEXT:    ds_read_b32 v8, v0
+; CHECK-NEXT:    ds_read_b32 v3, v1
+; CHECK-NEXT:    v_add_i32_e32 v1, vcc, 12, v2
+; CHECK-NEXT:    v_add_i32_e32 v12, vcc, 8, v2
+; CHECK-NEXT:    v_add_i32_e32 v13, vcc, 4, v2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    tbuffer_store_format_xy v[0:1], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:280 glc slc
-; CHECK-NEXT:    s_waitcnt expcnt(0)
-; CHECK-NEXT:    ds_read_b32 v0, v2
+; CHECK-NEXT:    tbuffer_store_format_xyzw v[3:6], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:264 glc slc
+; CHECK-NEXT:    tbuffer_store_format_xy v[7:8], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:280 glc slc
+; CHECK-NEXT:    ds_read_b32 v0, v11
+; CHECK-NEXT:    s_waitcnt expcnt(1)
+; CHECK-NEXT:    ds_read_b32 v5, v1
+; CHECK-NEXT:    ds_read_b32 v4, v12
+; CHECK-NEXT:    ds_read_b32 v3, v13
+; CHECK-NEXT:    ds_read_b32 v2, v2
 ; CHECK-NEXT:    ds_read_b32 v1, v10
-; CHECK-NEXT:    ds_read_b32 v2, v11
-; CHECK-NEXT:    ds_read_b32 v3, v12
-; CHECK-NEXT:    ds_read_b32 v4, v7
-; CHECK-NEXT:    ds_read_b32 v5, v8
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(5)
 ; CHECK-NEXT:    exp mrt0 off, off, off, off
-; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
-; CHECK-NEXT:    tbuffer_store_format_xyzw v[0:3], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:240 glc slc
+; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
+; CHECK-NEXT:    tbuffer_store_format_xyzw v[2:5], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:240 glc slc
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    tbuffer_store_format_xy v[4:5], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:256 glc slc
+; CHECK-NEXT:    tbuffer_store_format_xy v[0:1], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:256 glc slc
 ; CHECK-NEXT:    s_endpgm
   %load1 = load <6 x float>, <6 x float> addrspace(3)* %arg5, align 4
   %vec11 = shufflevector <6 x float> %load1, <6 x float> undef, <4 x i32>
Index: llvm/test/CodeGen/AMDGPU/store-local.128.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/store-local.128.ll
+++ llvm/test/CodeGen/AMDGPU/store-local.128.ll
@@ -462,12 +462,12 @@
 ; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    v_mov_b32_e32 v1, s7
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    v_mov_b32_e32 v4, s0
-; GFX6-NEXT:    v_mov_b32_e32 v2, s4
-; GFX6-NEXT:    v_mov_b32_e32 v3, s5
-; GFX6-NEXT:    ds_write2_b64 v4, v[2:3], v[0:1] offset1:1
+; GFX6-NEXT:    v_mov_b32_e32 v2, s6
+; GFX6-NEXT:    v_mov_b32_e32 v3, s7
+; GFX6-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: store_lds_v4i32_align8: