diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -713,6 +713,8 @@ defm : DSReadPat_mc ; defm : DSReadPat_mc ; +// Prefer ds_read over ds_read2, all other things being equal, because it has +// a larger immediate offset range. let AddedComplexity = 100 in { foreach vt = VReg_64.RegTypes in { @@ -725,6 +727,9 @@ defm : DSReadPat_mc ; } +// For performance reasons restrict this to alignment >= 16 even with +// unaligned-access-mode. At lower alignments ds_read2_b64 is always a better +// choice. foreach vt = VReg_128.RegTypes in { defm : DSReadPat_mc ; } @@ -735,10 +740,6 @@ defm : DSReadPat_mc ; } -foreach vt = VReg_128.RegTypes in { -defm : DSReadPat_mc ; -} - } // End SubtargetPredicate = HasUnalignedAccessMode } // End SubtargetPredicate = isGFX7Plus @@ -868,6 +869,8 @@ defm : DS128Bit8ByteAlignedPat_mc; } +// Prefer ds_write over ds_write2, all other things being equal, because it has +// a larger immediate offset range. let AddedComplexity = 100 in { foreach vt = VReg_64.RegTypes in { @@ -880,6 +883,9 @@ defm : DSWritePat_mc ; } +// For performance reasons restrict this to alignment >= 16 even with +// unaligned-access-mode. At lower alignments ds_write2_b64 is always a better +// choice. foreach vt = VReg_128.RegTypes in { defm : DSWritePat_mc ; } @@ -890,10 +896,6 @@ defm : DSWritePat_mc ; } -foreach vt = VReg_128.RegTypes in { -defm : DSWritePat_mc ; -} - } // End SubtargetPredicate = HasUnalignedAccessMode } // End SubtargetPredicate = isGFX7Plus diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll @@ -26,8 +26,8 @@ ; ALIGNED-DAG: ds_read2_b32 ; ALIGNED-DAG: ds_write2_b32 ; ALIGNED-DAG: ds_write2_b32 -; UNALIGNED-DAG: ds_read_b128 -; UNALIGNED-DAG: ds_write_b128 +; UNALIGNED-DAG: ds_read2_b64 +; UNALIGNED-DAG: ds_write2_b64 define amdgpu_kernel void @test_local_misaligned_v4(i32 addrspace(3)* %arg) { bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -108,8 +108,8 @@ ; GCN-LABEL: test_local_v4_aligned8: ; ALIGNED-DAG: ds_read2_b64 ; ALIGNED-DAG: ds_write2_b64 -; UNALIGNED-DAG: ds_read_b128 -; UNALIGNED-DAG: ds_write_b128 +; UNALIGNED-DAG: ds_read2_b64 +; UNALIGNED-DAG: ds_write2_b64 define amdgpu_kernel void @test_local_v4_aligned8(i32 addrspace(3)* %arg) { bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll @@ -11,7 +11,7 @@ ; GFX9-LABEL: load_lds_v4i32_align1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ds_read_b128 v[0:3], v0 +; GFX9-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -293,7 +293,7 @@ ; GFX9-LABEL: store_lds_v4i32_align1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ds_write_b128 v0, v[1:4] +; GFX9-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll --- a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll @@ -698,10 +698,10 @@ ; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 -; UNALIGNED-NEXT: ds_read_b128 v[0:3], v0 +; UNALIGNED-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 ; UNALIGNED-NEXT: v_mov_b32_e32 v4, s1 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-NEXT: ds_write_b128 v4, v[0:3] +; UNALIGNED-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 ; UNALIGNED-NEXT: s_endpgm %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 1 store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 1 @@ -772,10 +772,10 @@ ; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 -; UNALIGNED-NEXT: ds_read_b128 v[0:3], v0 +; UNALIGNED-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 ; UNALIGNED-NEXT: v_mov_b32_e32 v4, s1 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-NEXT: ds_write_b128 v4, v[0:3] +; UNALIGNED-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 ; UNALIGNED-NEXT: s_endpgm %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 2 store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 2 @@ -815,10 +815,10 @@ ; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 -; UNALIGNED-NEXT: ds_read_b128 v[0:3], v0 +; UNALIGNED-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 ; UNALIGNED-NEXT: v_mov_b32_e32 v4, s1 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-NEXT: ds_write_b128 v4, v[0:3] +; UNALIGNED-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 ; UNALIGNED-NEXT: s_endpgm %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 4 store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 4 @@ -826,27 +826,16 @@ } define amdgpu_kernel void @ds16align8(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) { -; ALIGNED-LABEL: ds16align8: -; ALIGNED: ; %bb.0: -; ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; ALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-NEXT: v_mov_b32_e32 v0, s0 -; ALIGNED-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 -; ALIGNED-NEXT: v_mov_b32_e32 v4, s1 -; ALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 -; ALIGNED-NEXT: s_endpgm -; -; UNALIGNED-LABEL: ds16align8: -; UNALIGNED: ; %bb.0: -; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 -; UNALIGNED-NEXT: ds_read_b128 v[0:3], v0 -; UNALIGNED-NEXT: v_mov_b32_e32 v4, s1 -; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-NEXT: ds_write_b128 v4, v[0:3] -; UNALIGNED-NEXT: s_endpgm +; GCN-LABEL: ds16align8: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 +; GCN-NEXT: v_mov_b32_e32 v4, s1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 +; GCN-NEXT: s_endpgm %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 8 store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 8 ret void diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll --- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll @@ -1035,7 +1035,7 @@ ; GFX9-UNALIGNED-LABEL: load_misaligned64_constant_offsets: ; GFX9-UNALIGNED: ; %bb.0: ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-UNALIGNED-NEXT: ds_read_b128 v[0:3], v4 +; GFX9-UNALIGNED-NEXT: ds_read2_b64 v[0:3], v4 offset1:1 ; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll --- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll @@ -837,11 +837,16 @@ ; ; GFX9-UNALIGNED-LABEL: store_misaligned64_constant_offsets: ; GFX9-UNALIGNED: ; %bb.0: -; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x7b -; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-UNALIGNED-NEXT: ds_write_b128 v1, v[0:3] +; GFX9-UNALIGNED-NEXT: s_movk_i32 s0, 0x7b +; GFX9-UNALIGNED-NEXT: s_mov_b32 s1, 0 +; GFX9-UNALIGNED-NEXT: s_mov_b32 s2, s0 +; GFX9-UNALIGNED-NEXT: s_mov_b32 s3, s1 +; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-UNALIGNED-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 ; GFX9-UNALIGNED-NEXT: s_endpgm store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4 store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4 @@ -1000,10 +1005,10 @@ ; GFX9-UNALIGNED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-UNALIGNED-NEXT: ds_write_b128 v4, v[0:3] +; GFX9-UNALIGNED-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 ; GFX9-UNALIGNED-NEXT: s_endpgm %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in diff --git a/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll b/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll --- a/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll @@ -26,8 +26,8 @@ ; ALIGNED-DAG: ds_read2_b32 ; ALIGNED-DAG: ds_write2_b32 ; ALIGNED-DAG: ds_write2_b32 -; UNALIGNED-DAG: ds_read_b128 -; UNALIGNED-DAG: ds_write_b128 +; UNALIGNED-DAG: ds_read2_b64 +; UNALIGNED-DAG: ds_write2_b64 define amdgpu_kernel void @test_local_misaligned_v4(i32 addrspace(3)* %arg) { bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -223,8 +223,8 @@ ; GCN-LABEL: test_local_v4_aligned8: ; ALIGNED-DAG: ds_read2_b64 ; ALIGNED-DAG: ds_write2_b64 -; UNALIGNED-DAG: ds_read_b128 -; UNALIGNED-DAG: ds_write_b128 +; UNALIGNED-DAG: ds_read2_b64 +; UNALIGNED-DAG: ds_write2_b64 define amdgpu_kernel void @test_local_v4_aligned8(i32 addrspace(3)* %arg) { bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x()