Index: llvm/lib/Target/AMDGPU/DSInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/DSInstructions.td
+++ llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -877,8 +877,8 @@
 
 let SubtargetPredicate = HasUnalignedAccessMode in {
 
-// FIXME: From performance point of view, is ds_read_b96/ds_write_b96 better choice
-// for unaligned accesses?
+// Selection will split most of the unaligned 3 dword accesses for performance
+// reasons when beneficial. Keep these two patterns for the remaining cases.
 foreach vt = VReg_96.RegTypes in {
 defm : DSReadPat_mc <DS_READ_B96, vt, "load_local">;
 defm : DSWritePat_mc <DS_WRITE_B96, vt, "store_local">;
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1553,6 +1553,18 @@
       // 12 byte accessing via ds_read/write_b96 require 16-byte alignment on
       // gfx8 and older.
       RequiredAlignment = Align(16);
+
+      if (Subtarget->hasUnalignedDSAccessEnabled()) {
+        // Naturally aligned access is fastest. However, also report it is Fast
+        // if memory is aligned less than DWORD. A narrow load or store will be
+        // equally slow as a single ds_read_b96/ds_write_b96, but there will be
+        // more of them, so overall we will pay less penalty issuing a single
+        // instruction.
+        if (IsFast)
+          *IsFast = Alignment >= RequiredAlignment || Alignment < Align(4);
+        return true;
+      }
+
       break;
     case 128:
       if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
Index: llvm/test/CodeGen/AMDGPU/ds-alignment.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/ds-alignment.ll
+++ llvm/test/CodeGen/AMDGPU/ds-alignment.ll
@@ -566,23 +566,11 @@
 ; ALIGNED-NEXT:    s_waitcnt lgkmcnt(1)
 ; ALIGNED-NEXT:    ds_write_b32 v3, v2 offset:8
 ; ALIGNED-NEXT:    s_endpgm
-;
-; UNALIGNED-LABEL: ds12align4:
-; UNALIGNED:       ; %bb.0:
-; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
-; UNALIGNED-NEXT:    ds_read_b96 v[0:2], v0
-; UNALIGNED-NEXT:    v_mov_b32_e32 v3, s1
-; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT:    ds_write_b96 v3, v[0:2]
-; UNALIGNED-NEXT:    s_endpgm
   %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 4
   store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 4
   ret void
 }
 
-; TODO: Why does the ALIGNED-SDAG code use ds_write_b64 but not ds_read_b64?
 define amdgpu_kernel void @ds12align8(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) {
 ; ALIGNED-SDAG-LABEL: ds12align8:
 ; ALIGNED-SDAG:       ; %bb.0:
@@ -611,17 +599,6 @@
 ; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
 ; ALIGNED-GISEL-NEXT:    ds_write_b32 v3, v2 offset:8
 ; ALIGNED-GISEL-NEXT:    s_endpgm
-;
-; UNALIGNED-LABEL: ds12align8:
-; UNALIGNED:       ; %bb.0:
-; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
-; UNALIGNED-NEXT:    ds_read_b96 v[0:2], v0
-; UNALIGNED-NEXT:    v_mov_b32_e32 v3, s1
-; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT:    ds_write_b96 v3, v[0:2]
-; UNALIGNED-NEXT:    s_endpgm
   %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 8
   store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 8
   ret void
Index: llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll
+++ llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll
@@ -47,12 +47,10 @@
 }
 
 ; GCN-LABEL: test_local_misaligned_v3:
-; ALIGNED-DAG: ds_read2_b32
-; ALIGNED-DAG: ds_read_b32
-; ALIGNED-DAG: ds_write2_b32
-; ALIGNED-DAG: ds_write_b32
-; UNALIGNED-DAG: ds_read_b96
-; UNALIGNED-DAG: ds_write_b96
+; GCN-DAG: ds_read2_b32
+; GCN-DAG: ds_read_b32
+; GCN-DAG: ds_write2_b32
+; GCN-DAG: ds_write_b32
 define amdgpu_kernel void @test_local_misaligned_v3(i32 addrspace(3)* %arg) {
 bb:
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
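
For reference, the alignment check added to SIISelLowering.cpp above reduces to a simple rule for 96-bit (3 dword) LDS accesses once unaligned DS access mode is enabled: report the access as Fast when it is naturally aligned or when its alignment is below a dword, and otherwise let it be split. The C++ sketch below only restates that rule for illustration; the free-standing function name, the raw byte-count parameter, and the main() driver are assumptions, not the actual SITargetLowering::allowsMisalignedMemoryAccessesImpl() interface.

// Standalone sketch of the reported-fast rule (hypothetical names).
#include <cassert>
#include <cstdint>

// Should a 96-bit LDS access with this alignment (in bytes) be reported as
// Fast when the subtarget has unaligned DS access enabled?
static bool isFast96BitLDSAccess(uint64_t AlignInBytes) {
  const uint64_t RequiredAlignment = 16; // ds_read/write_b96 on gfx8 and older
  // Naturally aligned accesses are fastest. Below dword alignment, the narrow
  // accesses a split would produce are each as slow as one ds_read_b96 or
  // ds_write_b96, so issuing the single wide instruction pays less overall.
  return AlignInBytes >= RequiredAlignment || AlignInBytes < 4;
}

int main() {
  assert(isFast96BitLDSAccess(16));  // naturally aligned
  assert(isFast96BitLDSAccess(1));   // sub-dword: keep the single b96 access
  assert(!isFast96BitLDSAccess(4));  // dword aligned: splitting is preferred
  assert(!isFast96BitLDSAccess(8));  // 8-byte aligned: likewise
  return 0;
}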