diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -713,38 +713,6 @@
 defm : DSReadPat_mc ;
 defm : DSReadPat_mc ;

-let AddedComplexity = 100 in {
-
-foreach vt = VReg_64.RegTypes in {
-defm : DSReadPat_mc ;
-}
-
-let SubtargetPredicate = isGFX7Plus in {
-
-foreach vt = VReg_96.RegTypes in {
-defm : DSReadPat_mc ;
-}
-
-foreach vt = VReg_128.RegTypes in {
-defm : DSReadPat_mc ;
-}
-
-let SubtargetPredicate = HasUnalignedAccessMode in {
-
-foreach vt = VReg_96.RegTypes in {
-defm : DSReadPat_mc ;
-}
-
-foreach vt = VReg_128.RegTypes in {
-defm : DSReadPat_mc ;
-}
-
-} // End SubtargetPredicate = HasUnalignedAccessMode
-
-} // End SubtargetPredicate = isGFX7Plus
-
-} // End AddedComplexity = 100
-
 let OtherPredicates = [D16PreservesUnusedBits] in {
 def : DSReadPat_D16;
 def : DSReadPat_D16;
@@ -870,6 +838,10 @@

 let AddedComplexity = 100 in {

+foreach vt = VReg_64.RegTypes in {
+defm : DSReadPat_mc ;
+}
+
 foreach vt = VReg_64.RegTypes in {
 defm : DSWritePat_mc ;
 }
@@ -877,24 +849,20 @@
 let SubtargetPredicate = isGFX7Plus in {

 foreach vt = VReg_96.RegTypes in {
-defm : DSWritePat_mc ;
-}
-
-foreach vt = VReg_128.RegTypes in {
-defm : DSWritePat_mc ;
+defm : DSReadPat_mc ;
 }

-let SubtargetPredicate = HasUnalignedAccessMode in {
-
 foreach vt = VReg_96.RegTypes in {
-defm : DSWritePat_mc ;
+defm : DSWritePat_mc ;
 }

 foreach vt = VReg_128.RegTypes in {
-defm : DSWritePat_mc ;
+defm : DSReadPat_mc ;
 }

-} // End SubtargetPredicate = HasUnalignedAccessMode
+foreach vt = VReg_128.RegTypes in {
+defm : DSWritePat_mc ;
 }

 } // End SubtargetPredicate = isGFX7Plus
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1403,42 +1403,32 @@
   if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
       AddrSpace == AMDGPUAS::REGION_ADDRESS) {
-    // Check if alignment requirements for ds_read/write instructions are
-    // disabled.
-    if (Subtarget->hasUnalignedDSAccessEnabled() &&
-        !Subtarget->hasLDSMisalignedBug()) {
-      if (IsFast)
-        *IsFast = Alignment != Align(2);
-      return true;
-    }
-
     if (Size == 64) {
-      // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
-      // aligned, 8 byte access in a single operation using ds_read2/write2_b32
-      // with adjacent offsets.
-      bool AlignedBy4 = Alignment >= Align(4);
+      // ds_read/write_b64 always require 8-byte alignment for performance
+      // reasons.
+      bool AlignedBy8 = Alignment >= Align(8);
       if (IsFast)
-        *IsFast = AlignedBy4;
+        *IsFast = AlignedBy8;

-      return AlignedBy4;
+      return AlignedBy8;
     }

     if (Size == 96) {
-      // ds_read/write_b96 require 16-byte alignment on gfx8 and older.
-      bool Aligned = Alignment >= Align(16);
+      // ds_read/write_b96 always require 16-byte alignment for performance
+      // reasons.
+      bool AlignedBy16 = Alignment >= Align(16);
       if (IsFast)
-        *IsFast = Aligned;
+        *IsFast = AlignedBy16;

-      return Aligned;
+      return AlignedBy16;
     }

     if (Size == 128) {
-      // ds_read/write_b128 require 16-byte alignment on gfx8 and older, but we
-      // can do a 8 byte aligned, 16 byte access in a single operation using
-      // ds_read2/write2_b64.
-      bool Aligned = Alignment >= Align(8);
+      // ds_read/write_b128 always require 16-byte alignment for performance
+      // reasons.
+      bool AlignedBy16 = Alignment >= Align(16);
       if (IsFast)
-        *IsFast = Aligned;
+        *IsFast = AlignedBy16;

-      return Aligned;
+      return AlignedBy16;
     }
   }
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
--- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -595,16 +595,32 @@
 ;
 ; GFX9-UNALIGNED-LABEL: unaligned_read2_f32:
 ; GFX9-UNALIGNED: ; %bb.0:
-; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s4, v2
-; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:8
-; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX9-UNALIGNED-NEXT: global_store_dword v2, v0, s[2:3]
-; GFX9-UNALIGNED-NEXT: s_endpgm
+; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v1, s4, v0
+; GFX9-UNALIGNED-NEXT: ds_read_u8 v2, v1
+; GFX9-UNALIGNED-NEXT: ds_read_u8 v3, v1 offset:1
+; GFX9-UNALIGNED-NEXT: ds_read_u8 v4, v1 offset:2
+; GFX9-UNALIGNED-NEXT: ds_read_u8 v5, v1 offset:3
+; GFX9-UNALIGNED-NEXT: ds_read_u8 v6, v1 offset:32
+; GFX9-UNALIGNED-NEXT: ds_read_u8 v7, v1 offset:33
+; GFX9-UNALIGNED-NEXT: ds_read_u8 v8, v1 offset:34
+; GFX9-UNALIGNED-NEXT: ds_read_u8 v1, v1 offset:35
+; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(6)
+; GFX9-UNALIGNED-NEXT: v_lshl_or_b32 v2, v3, 8, v2
+; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(4)
+; GFX9-UNALIGNED-NEXT: v_lshl_or_b32 v3, v5, 8, v4
+; GFX9-UNALIGNED-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(2)
+; GFX9-UNALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v6
+; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-UNALIGNED-NEXT: v_lshl_or_b32 v1, v1, 8, v8
+; GFX9-UNALIGNED-NEXT: v_lshl_or_b32 v1, v1, 16, v3
+; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v1, v2, v1
+; GFX9-UNALIGNED-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX9-UNALIGNED-NEXT: s_endpgm
 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
 %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i
 %val0 = load float, float addrspace(3)* %arrayidx0, align 1
@@ -689,16 +705,32 @@
 ;
 ; GFX9-UNALIGNED-LABEL: unaligned_offset_read2_f32:
 ; GFX9-UNALIGNED: ; %bb.0:
-; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-UNALIGNED-NEXT: v_add3_u32 v0, s4, v2, 5
-; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
-; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX9-UNALIGNED-NEXT: global_store_dword v2, v0, s[2:3]
-; GFX9-UNALIGNED-NEXT: s_endpgm
+; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v1, s4, v0
+; GFX9-UNALIGNED-NEXT: ds_read_u8 v2, v1 offset:5
+; GFX9-UNALIGNED-NEXT: ds_read_u8 v3, v1 offset:6
+; GFX9-UNALIGNED-NEXT: ds_read_u8 v4, v1 offset:7
+; GFX9-UNALIGNED-NEXT: ds_read_u8 v5, v1 offset:8
+; GFX9-UNALIGNED-NEXT: ds_read_u8 v6, v1 offset:9
+; GFX9-UNALIGNED-NEXT: ds_read_u8 v7, v1 offset:10
+; GFX9-UNALIGNED-NEXT: ds_read_u8 v8, v1 offset:11
+; GFX9-UNALIGNED-NEXT: ds_read_u8 v1, v1 offset:12
+; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(6)
+; GFX9-UNALIGNED-NEXT: v_lshl_or_b32 v2, v3, 8, v2
+; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(4)
+; GFX9-UNALIGNED-NEXT: v_lshl_or_b32 v3, v5, 8, v4
+; GFX9-UNALIGNED-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(2)
+; GFX9-UNALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v6
+; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-UNALIGNED-NEXT: v_lshl_or_b32 v1, v1, 8, v8
+; GFX9-UNALIGNED-NEXT: v_lshl_or_b32 v1, v1, 16, v3
+; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v1, v2, v1
+; GFX9-UNALIGNED-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX9-UNALIGNED-NEXT: s_endpgm
 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
 %base = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i
 %base.i8 = bitcast float addrspace(3)* %base to i8 addrspace(3)*
@@ -763,16 +795,22 @@
 ;
 ; GFX9-UNALIGNED-LABEL: misaligned_2_simple_read2_f32:
 ; GFX9-UNALIGNED: ; %bb.0:
-; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s4, v2
-; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:8
-; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX9-UNALIGNED-NEXT: global_store_dword v2, v0, s[2:3]
-; GFX9-UNALIGNED-NEXT: s_endpgm
+; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v1, s4, v0
+; GFX9-UNALIGNED-NEXT: ds_read_u16 v2, v1
+; GFX9-UNALIGNED-NEXT: ds_read_u16 v3, v1 offset:2
+; GFX9-UNALIGNED-NEXT: ds_read_u16 v4, v1 offset:32
+; GFX9-UNALIGNED-NEXT: ds_read_u16 v1, v1 offset:34
+; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(2)
+; GFX9-UNALIGNED-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-UNALIGNED-NEXT: v_lshl_or_b32 v1, v1, 16, v4
+; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v1, v2, v1
+; GFX9-UNALIGNED-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX9-UNALIGNED-NEXT: s_endpgm
 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
 %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i
 %val0 = load float, float addrspace(3)* %arrayidx0, align 2
@@ -1034,14 +1072,15 @@
 ;
 ; GFX9-UNALIGNED-LABEL: load_misaligned64_constant_offsets:
 ; GFX9-UNALIGNED: ; %bb.0:
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-UNALIGNED-NEXT: ds_read_b128 v[0:3], v4
-; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-UNALIGNED-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-UNALIGNED-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-UNALIGNED-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
-; GFX9-UNALIGNED-NEXT: s_endpgm
+; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v4 offset1:1
+; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[2:3], v4 offset0:2 offset1:3
+; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-UNALIGNED-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-UNALIGNED-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-UNALIGNED-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX9-UNALIGNED-NEXT: s_endpgm
 %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
 %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
 %sum = add i64 %val0, %val1
@@ -1054,33 +1093,37 @@
 define amdgpu_kernel void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) {
 ; CI-LABEL: load_misaligned64_constant_large_offsets:
 ; CI: ; %bb.0:
-; CI-NEXT: v_mov_b32_e32 v0, 0x4000
-; CI-NEXT: v_mov_b32_e32 v2, 0x7ff8
-; CI-NEXT: s_mov_b32 m0, -1
-; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
-; CI-NEXT: ds_read2_b32 v[2:3], v2 offset1:1
-; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
-; CI-NEXT: s_mov_b32 s3, 0xf000
-; CI-NEXT: s_mov_b32 s2, -1
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; CI-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; CI-NEXT: s_endpgm
+; CI-NEXT: s_movk_i32 s4, 0x4000
+; CI-NEXT: v_add_i32_e64 v0, vcc, s4, 0
+; CI-NEXT: s_movk_i32 s4, 0x7c00
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: v_add_i32_e64 v2, vcc, s4, 0
+; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
+; CI-NEXT: ds_read2_b32 v[2:3], v2 offset0:254 offset1:255
+; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT: s_mov_b32 s3, 0xf000
+; CI-NEXT: s_mov_b32 s2, -1
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; CI-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; CI-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: load_misaligned64_constant_large_offsets:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, 0x4000
-; GFX9-NEXT: v_mov_b32_e32 v2, 0x7ff8
-; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
-; GFX9-NEXT: ds_read2_b32 v[2:3], v2 offset1:1
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
-; GFX9-NEXT: s_endpgm
+; GFX9-NEXT: s_movk_i32 s2, 0x4000
+; GFX9-NEXT: v_add_u32_e64 v0, s2, 0
+; GFX9-NEXT: s_movk_i32 s2, 0x7c00
+; GFX9-NEXT: v_add_u32_e64 v2, s2, 0
+; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
+; GFX9-NEXT: ds_read2_b32 v[2:3], v2 offset0:254 offset1:255
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
 %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
 %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
 %sum = add i64 %val0, %val1
@@ -1094,61 +1137,58 @@
 define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb) #0 {
 ; CI-LABEL: sgemm_inner_loop_read2_sequence:
 ; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; CI-NEXT: s_lshl_b32 s0, s2, 2
-; CI-NEXT: s_add_i32 s1, s0, 0xc20
-; CI-NEXT: s_addk_i32 s0, 0xc60
-; CI-NEXT: v_mov_b32_e32 v0, s1
-; CI-NEXT: v_mov_b32_e32 v4, s0
-; CI-NEXT: s_mov_b32 m0, -1
-; CI-NEXT: ds_read2_b32 v[2:3], v0 offset1:1
-; CI-NEXT: ds_read2_b32 v[4:5], v4 offset1:1
-; CI-NEXT: v_lshlrev_b32_e32 v8, 2, v1
-; CI-NEXT: ds_read2_b32 v[0:1], v8 offset1:1
-; CI-NEXT: ds_read2_b32 v[6:7], v8 offset0:32 offset1:33
-; CI-NEXT: ds_read2_b32 v[8:9], v8 offset0:64 offset1:65
-; CI-NEXT: s_mov_b32 s7, 0xf000
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_add_f32_e32 v2, v2, v3
-; CI-NEXT: v_add_f32_e32 v2, v2, v4
-; CI-NEXT: v_add_f32_e32 v2, v2, v5
-; CI-NEXT: v_add_f32_e32 v0, v2, v0
-; CI-NEXT: v_add_f32_e32 v0, v0, v1
-; CI-NEXT: v_add_f32_e32 v0, v0, v6
-; CI-NEXT: v_add_f32_e32 v0, v0, v7
-; CI-NEXT: v_add_f32_e32 v0, v0, v8
-; CI-NEXT: s_mov_b32 s6, -1
-; CI-NEXT: v_add_f32_e32 v0, v0, v9
-; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; CI-NEXT: s_endpgm
+; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; CI-NEXT: s_lshl_b32 s0, s2, 2
+; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: s_movk_i32 s0, 0xc00
+; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: ds_read2_b32 v[2:3], v0 offset0:8 offset1:9
+; CI-NEXT: ds_read2_b32 v[4:5], v0 offset0:24 offset1:25
+; CI-NEXT: v_lshlrev_b32_e32 v8, 2, v1
+; CI-NEXT: ds_read2_b32 v[0:1], v8 offset1:1
+; CI-NEXT: ds_read2_b32 v[6:7], v8 offset0:32 offset1:33
+; CI-NEXT: ds_read2_b32 v[8:9], v8 offset0:64 offset1:65
+; CI-NEXT: s_mov_b32 s7, 0xf000
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_add_f32_e32 v2, v2, v3
+; CI-NEXT: v_add_f32_e32 v2, v2, v4
+; CI-NEXT: v_add_f32_e32 v2, v2, v5
+; CI-NEXT: v_add_f32_e32 v0, v2, v0
+; CI-NEXT: v_add_f32_e32 v0, v0, v1
+; CI-NEXT: v_add_f32_e32 v0, v0, v6
+; CI-NEXT: v_add_f32_e32 v0, v0, v7
+; CI-NEXT: v_add_f32_e32 v0, v0, v8
+; CI-NEXT: s_mov_b32 s6, -1
+; CI-NEXT: v_add_f32_e32 v0, v0, v9
+; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; CI-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: sgemm_inner_loop_read2_sequence:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_lshl_b32 s2, s2, 2
-; GFX9-NEXT: s_add_i32 s3, s2, 0xc20
-; GFX9-NEXT: s_addk_i32 s2, 0xc60
-; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: v_mov_b32_e32 v4, s2
-; GFX9-NEXT: ds_read2_b32 v[2:3], v0 offset1:1
-; GFX9-NEXT: ds_read2_b32 v[4:5], v4 offset1:1
-; GFX9-NEXT: v_lshlrev_b32_e32 v8, 2, v1
-; GFX9-NEXT: ds_read2_b32 v[0:1], v8 offset1:1
-; GFX9-NEXT: ds_read2_b32 v[6:7], v8 offset0:32 offset1:33
-; GFX9-NEXT: ds_read2_b32 v[8:9], v8 offset0:64 offset1:65
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_add_f32_e32 v2, v2, v3
-; GFX9-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX9-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX9-NEXT: v_add_f32_e32 v0, v2, v0
-; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_add_f32_e32 v0, v0, v6
-; GFX9-NEXT: v_add_f32_e32 v0, v0, v7
-; GFX9-NEXT: v_add_f32_e32 v0, v0, v8
-; GFX9-NEXT: v_mov_b32_e32 v10, 0
-; GFX9-NEXT: v_add_f32_e32 v0, v0, v9
-; GFX9-NEXT: global_store_dword v10, v0, s[0:1]
-; GFX9-NEXT: s_endpgm
+; GFX9-NEXT: s_lshl_b32 s2, s2, 2
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_add_u32_e32 v0, 0xc00, v0
+; GFX9-NEXT: ds_read2_b32 v[2:3], v0 offset0:8 offset1:9
+; GFX9-NEXT: ds_read2_b32 v[4:5], v0 offset0:24 offset1:25
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 2, v1
+; GFX9-NEXT: ds_read2_b32 v[0:1], v8 offset1:1
+; GFX9-NEXT: ds_read2_b32 v[6:7], v8 offset0:32 offset1:33
+; GFX9-NEXT: ds_read2_b32 v[8:9], v8 offset0:64 offset1:65
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_add_f32_e32 v2, v2, v3
+; GFX9-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX9-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX9-NEXT: v_add_f32_e32 v0, v2, v0
+; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX9-NEXT: v_add_f32_e32 v0, v0, v6
+; GFX9-NEXT: v_add_f32_e32 v0, v0, v7
+; GFX9-NEXT: v_add_f32_e32 v0, v0, v8
+; GFX9-NEXT: v_mov_b32_e32 v10, 0
+; GFX9-NEXT: v_add_f32_e32 v0, v0, v9
+; GFX9-NEXT: global_store_dword v10, v0, s[0:1]
+; GFX9-NEXT: s_endpgm
 %x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1
 %y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1
 %arrayidx44 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i
@@ -1257,34 +1297,34 @@
 define amdgpu_kernel void @ds_read_diff_base_interleaving(
 ; CI-LABEL: ds_read_diff_base_interleaving:
 ; CI: ; %bb.0: ; %bb
-; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb
-; CI-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; CI-NEXT: s_mov_b32 m0, -1
-; CI-NEXT: s_mov_b32 s7, 0xf000
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v1
-; CI-NEXT: v_add_i32_e32 v4, vcc, s1, v0
-; CI-NEXT: v_add_i32_e32 v3, vcc, s2, v1
-; CI-NEXT: v_add_i32_e32 v6, vcc, s3, v0
-; CI-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
-; CI-NEXT: ds_read2_b32 v[2:3], v3 offset1:1
-; CI-NEXT: ds_read2_b32 v[4:5], v4 offset1:4
-; CI-NEXT: s_mov_b32 s6, -1
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mul_f32_e32 v0, v0, v4
-; CI-NEXT: v_add_f32_e32 v4, 2.0, v0
-; CI-NEXT: v_mul_f32_e32 v5, v1, v5
-; CI-NEXT: ds_read2_b32 v[0:1], v6 offset1:4
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mul_f32_e32 v0, v2, v0
-; CI-NEXT: v_sub_f32_e32 v0, v4, v0
-; CI-NEXT: v_sub_f32_e32 v0, v0, v5
-; CI-NEXT: v_mul_f32_e32 v1, v3, v1
-; CI-NEXT: v_sub_f32_e32 v0, v0, v1
-; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:40
-; CI-NEXT: s_endpgm
+; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb
+; CI-NEXT: v_lshlrev_b32_e32 v1, 4, v1
+; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: s_mov_b32 s7, 0xf000
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v1
+; CI-NEXT: v_add_i32_e32 v3, vcc, s1, v0
+; CI-NEXT: v_add_i32_e32 v4, vcc, s2, v1
+; CI-NEXT: v_add_i32_e32 v5, vcc, s3, v0
+; CI-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
+; CI-NEXT: ds_read2_b32 v[2:3], v3 offset1:4
+; CI-NEXT: s_mov_b32 s6, -1
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mul_f32_e32 v0, v0, v2
+; CI-NEXT: v_add_f32_e32 v6, 2.0, v0
+; CI-NEXT: v_mul_f32_e32 v7, v1, v3
+; CI-NEXT: ds_read2_b32 v[0:1], v4 offset1:1
+; CI-NEXT: ds_read2_b32 v[2:3], v5 offset1:4
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mul_f32_e32 v0, v0, v2
+; CI-NEXT: v_sub_f32_e32 v0, v6, v0
+; CI-NEXT: v_sub_f32_e32 v0, v0, v7
+; CI-NEXT: v_mul_f32_e32 v1, v1, v3
+; CI-NEXT: v_sub_f32_e32 v0, v0, v1
+; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:40
+; CI-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: ds_read_diff_base_interleaving:
 ; GFX9: ; %bb.0: ; %bb
@@ -1521,13 +1561,29 @@
 ;
 ; GFX9-UNALIGNED-LABEL: read2_v2i32_align1_odd_offset:
 ; GFX9-UNALIGNED: ; %bb.0: ; %entry
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x41
-; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-UNALIGNED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX9-UNALIGNED-NEXT: s_endpgm
+; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-UNALIGNED-NEXT: ds_read_u8 v0, v2 offset:65
+; GFX9-UNALIGNED-NEXT: ds_read_u8 v3, v2 offset:66
+; GFX9-UNALIGNED-NEXT: ds_read_u8 v4, v2 offset:67
+; GFX9-UNALIGNED-NEXT: ds_read_u8 v5, v2 offset:68
+; GFX9-UNALIGNED-NEXT: ds_read_u8 v1, v2 offset:69
+; GFX9-UNALIGNED-NEXT: ds_read_u8 v6, v2 offset:70
+; GFX9-UNALIGNED-NEXT: ds_read_u8 v7, v2 offset:71
+; GFX9-UNALIGNED-NEXT: ds_read_u8 v8, v2 offset:72
+; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; GFX9-UNALIGNED-NEXT: v_or_b32_e32 v1, v6, v1
+; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 8, v8
+; GFX9-UNALIGNED-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 8, v5
+; GFX9-UNALIGNED-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-UNALIGNED-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-UNALIGNED-NEXT: v_or_b32_e32 v1, v6, v1
+; GFX9-UNALIGNED-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX9-UNALIGNED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-UNALIGNED-NEXT: s_endpgm
 entry:
 %load = load <2 x i32>, <2 x i32> addrspace(3)* bitcast (i8 addrspace(3)* getelementptr (i8, i8 addrspace(3)* bitcast ([100 x <2 x i32>] addrspace(3)* @v2i32_align1 to i8 addrspace(3)*), i32 65) to <2 x i32> addrspace(3)*), align 1
 store <2 x i32> %load, <2 x i32> addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
--- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
@@ -837,12 +837,17 @@
 ;
 ; GFX9-UNALIGNED-LABEL: store_misaligned64_constant_offsets:
 ; GFX9-UNALIGNED: ; %bb.0:
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x7b
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-UNALIGNED-NEXT: ds_write_b128 v1, v[0:3]
-; GFX9-UNALIGNED-NEXT: s_endpgm
+; GFX9-UNALIGNED-NEXT: s_movk_i32 s0, 0x7b
+; GFX9-UNALIGNED-NEXT: s_mov_b32 s1, 0
+; GFX9-UNALIGNED-NEXT: s_mov_b32 s2, s0
+; GFX9-UNALIGNED-NEXT: s_mov_b32 s3, s1
+; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-UNALIGNED-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
+; GFX9-UNALIGNED-NEXT: s_endpgm
 store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
 store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
 ret void
@@ -993,18 +998,18 @@
 ;
 ; GFX9-UNALIGNED-LABEL: simple_write2_v4f32_superreg_align4:
 ; GFX9-UNALIGNED: ; %bb.0:
-; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x24
-; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
-; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-UNALIGNED-NEXT: v_lshl_add_u32 v4, v0, 4, s4
-; GFX9-UNALIGNED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-UNALIGNED-NEXT: ds_write_b128 v4, v[0:3]
-; GFX9-UNALIGNED-NEXT: s_endpgm
+; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x24
+; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-UNALIGNED-NEXT: v_lshl_add_u32 v4, v0, 4, s4
+; GFX9-UNALIGNED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
+; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-UNALIGNED-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
+; GFX9-UNALIGNED-NEXT: s_endpgm
 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
 %in.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in
 %val0 = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/store-local.128.ll
--- a/llvm/test/CodeGen/AMDGPU/store-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-local.128.ll
@@ -316,18 +316,18 @@
 ;
 ; GFX6-LABEL: store_lds_v4i32_align4:
 ; GFX6: ; %bb.0:
-; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9
-; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
-; GFX6-NEXT: s_mov_b32 m0, -1
-; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s4
-; GFX6-NEXT: v_mov_b32_e32 v1, s1
-; GFX6-NEXT: v_mov_b32_e32 v2, s0
-; GFX6-NEXT: ds_write2_b32 v0, v2, v1 offset1:1
-; GFX6-NEXT: v_mov_b32_e32 v1, s3
-; GFX6-NEXT: v_mov_b32_e32 v2, s2
-; GFX6-NEXT: ds_write2_b32 v0, v2, v1 offset0:2 offset1:3
-; GFX6-NEXT: s_endpgm
+; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9
+; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
+; GFX6-NEXT: s_mov_b32 m0, -1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: v_mov_b32_e32 v1, s0
+; GFX6-NEXT: v_mov_b32_e32 v2, s1
+; GFX6-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
+; GFX6-NEXT: v_mov_b32_e32 v1, s2
+; GFX6-NEXT: v_mov_b32_e32 v2, s3
+; GFX6-NEXT: ds_write2_b32 v0, v1, v2 offset0:2 offset1:3
+; GFX6-NEXT: s_endpgm
 store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 4
 ret void
 }
@@ -335,44 +335,44 @@
 define amdgpu_kernel void @store_lds_v4i32_align8(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
 ; GFX9-LABEL: store_lds_v4i32_align8:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24
-; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v4, s2
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_mov_b32_e32 v3, s7
-; GFX9-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
-; GFX9-NEXT: s_endpgm
+; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, s2
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
+; GFX9-NEXT: s_endpgm
 ;
 ; GFX7-LABEL: store_lds_v4i32_align8:
 ; GFX7: ; %bb.0:
-; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9
-; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
-; GFX7-NEXT: s_mov_b32 m0, -1
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, s4
-; GFX7-NEXT: v_mov_b32_e32 v0, s0
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
-; GFX7-NEXT: s_endpgm
+; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v4, s4
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: v_mov_b32_e32 v2, s2
+; GFX7-NEXT: v_mov_b32_e32 v3, s3
+; GFX7-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
+; GFX7-NEXT: s_endpgm
 ;
 ; GFX6-LABEL: store_lds_v4i32_align8:
 ; GFX6: ; %bb.0:
-; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9
-; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
-; GFX6-NEXT: s_mov_b32 m0, -1
-; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v4, s4
-; GFX6-NEXT: v_mov_b32_e32 v0, s2
-; GFX6-NEXT: v_mov_b32_e32 v1, s3
-; GFX6-NEXT: v_mov_b32_e32 v2, s0
-; GFX6-NEXT: v_mov_b32_e32 v3, s1
-; GFX6-NEXT: ds_write2_b64 v4, v[2:3], v[0:1] offset1:1
-; GFX6-NEXT: s_endpgm
+; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9
+; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
+; GFX6-NEXT: s_mov_b32 m0, -1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v4, s4
+; GFX6-NEXT: v_mov_b32_e32 v0, s0
+; GFX6-NEXT: v_mov_b32_e32 v1, s1
+; GFX6-NEXT: v_mov_b32_e32 v2, s2
+; GFX6-NEXT: v_mov_b32_e32 v3, s3
+; GFX6-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
+; GFX6-NEXT: s_endpgm
 store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 8
 ret void
 }
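
For reference, the behavioral core of the SIISelLowering.cpp hunk above can be summarized in a small standalone C++ sketch. This is not part of the patch: the function name allowsMisalignedLDSAccess, the SizeInBits/AlignInBytes parameters, and the plain-bool signature are simplified stand-ins for the real TargetLowering hook, and only the LOCAL/REGION alignment rule shown in the diff is reproduced.

#include <cstdint>

// Sketch only: mirrors the alignment checks this change applies to LDS
// (LOCAL/REGION address space) accesses; names and signature are simplified.
bool allowsMisalignedLDSAccess(unsigned SizeInBits, uint64_t AlignInBytes,
                               bool *IsFast) {
  if (SizeInBits == 64) {
    // ds_read/write_b64 now need 8-byte alignment to be reported as fast.
    bool AlignedBy8 = AlignInBytes >= 8;
    if (IsFast)
      *IsFast = AlignedBy8;
    return AlignedBy8;
  }
  if (SizeInBits == 96 || SizeInBits == 128) {
    // ds_read/write_b96 and ds_read/write_b128 now need 16-byte alignment.
    bool AlignedBy16 = AlignInBytes >= 16;
    if (IsFast)
      *IsFast = AlignedBy16;
    return AlignedBy16;
  }
  // Other sizes are handled by the surrounding code in the real function.
  return false;
}

Under this sketch, a 128-bit LDS access with only 8-byte alignment reports IsFast = false, which matches the direction of the test updates above, where ds_read_b128/ds_write_b128 on 4- or 8-byte-aligned values are replaced by ds_read2/ds_write2 sequences or byte-sized operations.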