Index: llvm/lib/Target/AMDGPU/DSInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/DSInstructions.td
+++ llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -877,6 +877,15 @@
 let SubtargetPredicate = HasUnalignedAccessMode in {
 
+// Select 64-bit loads and stores aligned to less than 4 bytes as a single
+// ds_read_b64/ds_write_b64 instruction, as this is faster than the
+// ds_read2_b32/ds_write2_b32 pair which would be used otherwise. In that case
+// a b32 access would still be misaligned, but we would have 2 of them.
+foreach vt = VReg_64.RegTypes in {
+defm : DSReadPat_mc <DS_READ_B64, vt, "load_align_less_than_4_local">;
+defm : DSWritePat_mc <DS_WRITE_B64, vt, "store_align_less_than_4_local">;
+}
+
 // Selection will split most of the unaligned 3 dword accesses due to performance
 // reasons when beneficial. Keep these two patterns for the rest of the cases.
 foreach vt = VReg_96.RegTypes in {
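As a rough illustration of what these patterns catch (a hand-written sketch, not taken from the patch; the kernel and value names are invented), an i64 LDS access with alignment below 4 now selects the single-instruction form when unaligned access mode is enabled:

define amdgpu_kernel void @copy_i64_align2(i64 addrspace(3)* %in, i64 addrspace(3)* %out) {
  ; Only 2-byte aligned: previously split into ds_read2_b32/ds_write2_b32,
  ; now expected to select one ds_read_b64 and one ds_write_b64.
  %val = load i64, i64 addrspace(3)* %in, align 2
  store i64 %val, i64 addrspace(3)* %out, align 2
  ret void
}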
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1545,6 +1545,16 @@
     // can do a 4 byte aligned, 8 byte access in a single operation using
     // ds_read2/write2_b32 with adjacent offsets.
     RequiredAlignment = Align(4);
+
+    if (Subtarget->hasUnalignedDSAccessEnabled()) {
+      // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
+      // ds_write2_b32 depending on the alignment. In either case there is
+      // no faster way of doing this.
+      if (IsFast)
+        *IsFast = true;
+      return true;
+    }
+
     break;
   case 96:
     if (!Subtarget->hasDS96AndDS128())
@@ -1593,14 +1603,8 @@
     break;
   }
 
-  if (IsFast) {
-    // FIXME: Lie it is fast if +unaligned-access-mode is passed so that
-    // DS accesses get vectorized. Do this only for sizes below 96 as
-    // b96 and b128 cases already properly handled.
-    // Remove Subtarget check once all sizes properly handled.
-    *IsFast = Alignment >= RequiredAlignment ||
-              (Subtarget->hasUnalignedDSAccessEnabled() && Size < 96);
-  }
+  if (IsFast)
+    *IsFast = Alignment >= RequiredAlignment;
 
   return Alignment >= RequiredAlignment ||
          Subtarget->hasUnalignedDSAccessEnabled();
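The early return above is what keeps 64-bit LDS accesses being formed in the first place: the load/store vectorizer asks the target whether a merged access is fast through allowsMisalignedMemoryAccesses. A sketch of the kind of input that benefits (hand-written for illustration; the kernel name and layout are invented, and the vectorization itself is up to the pass):

define amdgpu_kernel void @merge_adjacent_b32(i32 addrspace(3)* %p, <2 x i32> addrspace(1)* %out) {
  ; Two adjacent dword LDS loads, each only byte-aligned. Since the merged
  ; 8-byte access is now honestly reported as fast, the vectorizer may
  ; combine them into one <2 x i32> load, which the new patterns select
  ; as a single ds_read_b64.
  %p1 = getelementptr inbounds i32, i32 addrspace(3)* %p, i32 1
  %a = load i32, i32 addrspace(3)* %p, align 1
  %b = load i32, i32 addrspace(3)* %p1, align 1
  %v0 = insertelement <2 x i32> undef, i32 %a, i32 0
  %v1 = insertelement <2 x i32> %v0, i32 %b, i32 1
  store <2 x i32> %v1, <2 x i32> addrspace(1)* %out, align 8
  ret void
}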
Index: llvm/test/CodeGen/AMDGPU/ds-alignment.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/ds-alignment.ll
+++ llvm/test/CodeGen/AMDGPU/ds-alignment.ll
@@ -278,10 +278,10 @@
 ; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
-; UNALIGNED-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
+; UNALIGNED-NEXT:    ds_read_b64 v[0:1], v0
 ; UNALIGNED-NEXT:    v_mov_b32_e32 v2, s1
 ; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT:    ds_write2_b32 v2, v0, v1 offset1:1
+; UNALIGNED-NEXT:    ds_write_b64 v2, v[0:1]
 ; UNALIGNED-NEXT:    s_endpgm
   %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 1
   store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 1
@@ -334,10 +334,10 @@
 ; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
-; UNALIGNED-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
+; UNALIGNED-NEXT:    ds_read_b64 v[0:1], v0
 ; UNALIGNED-NEXT:    v_mov_b32_e32 v2, s1
 ; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT:    ds_write2_b32 v2, v0, v1 offset1:1
+; UNALIGNED-NEXT:    ds_write_b64 v2, v[0:1]
 ; UNALIGNED-NEXT:    s_endpgm
   %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 2
   store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 2
Index: llvm/test/CodeGen/AMDGPU/ds_read2.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -691,8 +691,8 @@
 ; GFX9-UNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX9-UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX9-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-UNALIGNED-NEXT:    v_add3_u32 v0, s2, v2, 5
-; GFX9-UNALIGNED-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
+; GFX9-UNALIGNED-NEXT:    v_add_u32_e32 v0, s2, v2
+; GFX9-UNALIGNED-NEXT:    ds_read_b64 v[0:1], v0 offset:5
 ; GFX9-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-UNALIGNED-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-UNALIGNED-NEXT:    global_store_dword v2, v0, s[0:1]
@@ -1530,10 +1530,9 @@
 ;
 ; GFX9-UNALIGNED-LABEL: read2_v2i32_align1_odd_offset:
 ; GFX9-UNALIGNED:       ; %bb.0: ; %entry
-; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v0, 0x41
-; GFX9-UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; GFX9-UNALIGNED-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
 ; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX9-UNALIGNED-NEXT:    ds_read_b64 v[0:1], v2 offset:65
 ; GFX9-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-UNALIGNED-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-UNALIGNED-NEXT:    s_endpgm
Index: llvm/test/CodeGen/AMDGPU/ds_write2.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/ds_write2.ll
+++ llvm/test/CodeGen/AMDGPU/ds_write2.ll
@@ -708,11 +708,9 @@
 ; GFX9-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-UNALIGNED-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
 ; GFX9-UNALIGNED-NEXT:    v_add_u32_e32 v2, s4, v2
-; GFX9-UNALIGNED-NEXT:    v_add_u32_e32 v3, 5, v2
-; GFX9-UNALIGNED-NEXT:    v_add_u32_e32 v2, 9, v2
 ; GFX9-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-UNALIGNED-NEXT:    ds_write2_b32 v3, v0, v1 offset1:1
-; GFX9-UNALIGNED-NEXT:    ds_write2_b32 v2, v0, v1 offset1:1
+; GFX9-UNALIGNED-NEXT:    ds_write_b64 v2, v[0:1] offset:5
+; GFX9-UNALIGNED-NEXT:    ds_write_b64 v2, v[0:1] offset:9
 ; GFX9-UNALIGNED-NEXT:    s_endpgm
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
@@ -1043,10 +1041,10 @@
 ;
 ; GFX9-UNALIGNED-LABEL: write2_v2i32_align1_odd_offset:
 ; GFX9-UNALIGNED:       ; %bb.0: ; %entry
-; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v0, 0x41
-; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v1, 0x7b
-; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v2, 0x1c8
-; GFX9-UNALIGNED-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
+; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v1, 0x1c8
+; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-UNALIGNED-NEXT:    ds_write_b64 v2, v[0:1] offset:65
 ; GFX9-UNALIGNED-NEXT:    s_endpgm
 entry:
   store <2 x i32> <i32 123, i32 456>, <2 x i32> addrspace(3)* bitcast (i8 addrspace(3)* getelementptr (i8, i8 addrspace(3)* bitcast ([100 x <2 x i32>] addrspace(3)* @v2i32_align1 to i8 addrspace(3)*), i32 65) to <2 x i32> addrspace(3)*), align 1
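The UNALIGNED/GFX9-UNALIGNED check lines above come from runs with unaligned access mode enabled. As a sketch (the authoritative RUN lines live in the test files themselves), they correspond to an invocation along the lines of:

; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=+unaligned-access-mode < %s \
; RUN:   | FileCheck -check-prefix=GFX9-UNALIGNED %s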