Index: llvm/lib/Target/AMDGPU/DSInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/DSInstructions.td
+++ llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -884,8 +884,14 @@
 defm : DSWritePat_mc <DS_WRITE_B96, vt, "store_local">;
 }
 
-// For performance reasons, *do not* select ds_read_b128/ds_write_b128 for unaligned
-// accesses.
+// Select 128 bit loads and stores aligned less than 4 as a single
+// ds_read_b128/ds_write_b128 instruction, as this is faster than the
+// ds_read2_b64/ds_write2_b64 which would be used otherwise. In that case each
+// b64 access would still be misaligned, and we would issue 2 of them.
+foreach vt = VReg_128.RegTypes in {
+defm : DSReadPat_mc <DS_READ_B128, vt, "load_align_less_than_4_local">;
+defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_align_less_than_4_local">;
+}
 
 } // End SubtargetPredicate = HasUnalignedAccessMode
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1574,6 +1574,18 @@
       // gfx8 and older, but we can do a 8 byte aligned, 16 byte access in a
       // single operation using ds_read2/write2_b64.
       RequiredAlignment = Align(8);
+
+      if (Subtarget->hasUnalignedDSAccessEnabled()) {
+        // Naturally aligned access is fastest. However, also report it is Fast
+        // if memory is aligned less than DWORD. A narrow load or store would
+        // be equally slow as a single ds_read_b128/ds_write_b128, but we would
+        // need more of them, so overall we pay less penalty by issuing a
+        // single instruction.
+        if (IsFast)
+          *IsFast = Alignment >= RequiredAlignment || Alignment < Align(4);
+        return true;
+      }
+
       break;
     default:
       if (Size > 32)
@@ -1584,9 +1596,11 @@
 
     if (IsFast) {
       // FIXME: Lie it is fast if +unaligned-access-mode is passed so that
-      // DS accesses get vectorized.
+      // DS accesses get vectorized. Do this only for sizes below 96, as the
+      // b96 and b128 cases are already handled properly.
+      // Remove the Subtarget check once all sizes are handled properly.
       *IsFast = Alignment >= RequiredAlignment ||
-                Subtarget->hasUnalignedDSAccessEnabled();
+                (Subtarget->hasUnalignedDSAccessEnabled() && Size < 96);
     }
 
     return Alignment >= RequiredAlignment ||
@@ -1657,8 +1671,22 @@
     return false;
   }
 
-  return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
-                                            Alignment, Flags, IsFast);
+  bool Allow = allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
+                                                  Alignment, Flags, IsFast);
+
+  if (Allow && IsFast && Subtarget->hasUnalignedDSAccessEnabled() &&
+      (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
+       AddrSpace == AMDGPUAS::REGION_ADDRESS)) {
+    // Lie it is fast if +unaligned-access-mode is passed so that DS accesses
+    // get vectorized. We could use ds_read2_b*/ds_write2_b* instructions on
+    // misaligned data, which is faster than a pair of ds_read_b*/ds_write_b*
+    // which would be equally misaligned.
+    // This is only used by the common passes; selection always calls the
+    // allowsMisalignedMemoryAccessesImpl version.
+    *IsFast = true;
+  }
+
+  return Allow;
 }
 
 EVT SITargetLowering::getOptimalMemOpType(
Index: llvm/lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -585,6 +585,34 @@
   let IsTruncStore = 0;
 }
 
+let PredicateCode = [{return cast<MemSDNode>(N)->getAlignment() < 4;}],
+    GISelPredicateCode = [{return (*MI.memoperands_begin())->getAlign() < 4;}],
+    AddressSpaces = [ AddrSpaces.Local ] in {
+def load_align_less_than_4_local : PatFrag<(ops node:$ptr),
+                                           (load_local node:$ptr)> {
+  let IsLoad = 1;
+  let IsNonExtLoad = 1;
+}
+
+def load_align_less_than_4_local_m0 : PatFrag<(ops node:$ptr),
+                                              (load_local_m0 node:$ptr)> {
+  let IsLoad = 1;
+  let IsNonExtLoad = 1;
+}
+
+def store_align_less_than_4_local : PatFrag <(ops node:$value, node:$ptr),
+                                             (store_local node:$value, node:$ptr)> {
+  let IsStore = 1;
+  let IsTruncStore = 0;
+}
+
+def store_align_less_than_4_local_m0 : PatFrag <(ops node:$value, node:$ptr),
+                                                (store_local_m0 node:$value, node:$ptr)> {
+  let IsStore = 1;
+  let IsTruncStore = 0;
+}
+}
+
 let AddressSpaces = StoreAddress_local.AddrSpaces in {
 
 def atomic_store_local_8_m0 : PatFrag <
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
@@ -11,7 +11,7 @@
 ; GFX9-LABEL: load_lds_v4i32_align1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
+; GFX9-NEXT:    ds_read_b128 v[0:3], v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -153,7 +153,7 @@
 ; GFX9-LABEL: store_lds_v4i32_align1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; GFX9-NEXT:    ds_write_b128 v0, v[1:4]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
Index: llvm/test/CodeGen/AMDGPU/ds-alignment.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/ds-alignment.ll
+++ llvm/test/CodeGen/AMDGPU/ds-alignment.ll
@@ -798,10 +798,10 @@
 ; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
-; UNALIGNED-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
+; UNALIGNED-NEXT:    ds_read_b128 v[0:3], v0
 ; UNALIGNED-NEXT:    v_mov_b32_e32 v4, s1
 ; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
+; UNALIGNED-NEXT:    ds_write_b128 v4, v[0:3]
 ; UNALIGNED-NEXT:    s_endpgm
   %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 1
   store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 1
@@ -874,10 +874,10 @@
 ; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
-; UNALIGNED-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
+; UNALIGNED-NEXT:    ds_read_b128 v[0:3], v0
 ; UNALIGNED-NEXT:    v_mov_b32_e32 v4, s1
 ; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
+; UNALIGNED-NEXT:    ds_write_b128 v4, v[0:3]
 ; UNALIGNED-NEXT:    s_endpgm
   %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 2
   store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 2
@@ -899,16 +899,30 @@
 ; ALIGNED-NEXT:    ds_write2_b32 v4, v2, v3 offset0:2 offset1:3
 ; ALIGNED-NEXT:    s_endpgm
 ;
-; UNALIGNED-LABEL: ds16align4:
-; UNALIGNED:       ; %bb.0:
-; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
-; UNALIGNED-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
-; UNALIGNED-NEXT:    v_mov_b32_e32 v4, s1
-; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
-; UNALIGNED-NEXT:    s_endpgm
+; UNALIGNED-SDAG-LABEL: ds16align4:
+; UNALIGNED-SDAG:       ; %bb.0:
+; UNALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; UNALIGNED-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; UNALIGNED-SDAG-NEXT:    ds_read2_b32 v[0:1], v2 offset0:2 offset1:3
+; UNALIGNED-SDAG-NEXT:    ds_read2_b32 v[2:3], v2 offset1:1
+; UNALIGNED-SDAG-NEXT:    v_mov_b32_e32 v4, s1
+; UNALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
+; UNALIGNED-SDAG-NEXT:    ds_write2_b32 v4, v0, v1 offset0:2 offset1:3
+; UNALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
+; UNALIGNED-SDAG-NEXT:    ds_write2_b32 v4, v2, v3 offset1:1
+; UNALIGNED-SDAG-NEXT:    s_endpgm
+;
+; UNALIGNED-GISEL-LABEL: ds16align4:
+; UNALIGNED-GISEL:       ; %bb.0:
+; UNALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; UNALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; UNALIGNED-GISEL-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
+; UNALIGNED-GISEL-NEXT:    v_mov_b32_e32 v4, s1
+; UNALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; UNALIGNED-GISEL-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
+; UNALIGNED-GISEL-NEXT:    s_endpgm
   %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 4
   store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 4
   ret void
Index: llvm/test/CodeGen/AMDGPU/ds_write2.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/ds_write2.ll
+++ llvm/test/CodeGen/AMDGPU/ds_write2.ll
@@ -987,14 +987,15 @@
 ; GFX9-UNALIGNED-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
 ; GFX9-UNALIGNED-NEXT:    s_load_dword s4, s[0:1], 0x0
 ; GFX9-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-UNALIGNED-NEXT:    v_lshl_add_u32 v4, v0, 4, s4
+; GFX9-UNALIGNED-NEXT:    v_lshl_add_u32 v0, v0, 4, s4
 ; GFX9-UNALIGNED-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
 ; GFX9-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-UNALIGNED-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
+; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v3, s0
+; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v4, s1
+; GFX9-UNALIGNED-NEXT:    ds_write2_b32 v0, v1, v2 offset0:2 offset1:3
+; GFX9-UNALIGNED-NEXT:    ds_write2_b32 v0, v3, v4 offset1:1
 ; GFX9-UNALIGNED-NEXT:    s_endpgm
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in
Index: llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll
+++ llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll
@@ -22,12 +22,10 @@
 }
 
 ; GCN-LABEL: test_local_misaligned_v4:
-; ALIGNED-DAG: ds_read2_b32
-; ALIGNED-DAG: ds_read2_b32
-; ALIGNED-DAG: ds_write2_b32
-; ALIGNED-DAG: ds_write2_b32
-; UNALIGNED-DAG: ds_read2_b64
-; UNALIGNED-DAG: ds_write2_b64
+; GCN-DAG: ds_read2_b32
+; GCN-DAG: ds_read2_b32
+; GCN-DAG: ds_write2_b32
+; GCN-DAG: ds_write2_b32
 define amdgpu_kernel void @test_local_misaligned_v4(i32 addrspace(3)* %arg) {
 bb:
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()