diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -2994,12 +2994,11 @@ // the bytes again are not eliminated in the case of an unaligned copy. if (!allowsMisalignedMemoryAccesses( VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) { - SDValue Ops[2]; - if (VT.isVector()) - std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(LN, DAG); - else - std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG); + return SplitVectorLoad(SDValue(LN, 0), DAG); + + SDValue Ops[2]; + std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG); return DAG.getMergeValues(Ops, SDLoc(N)); } @@ -3050,7 +3049,7 @@ if (!allowsMisalignedMemoryAccesses( VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) { if (VT.isVector()) - return scalarizeVectorStore(SN, DAG); + return SplitVectorStore(SDValue(SN, 0), DAG); return expandUnalignedStore(SN, DAG); } diff --git a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll --- a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll @@ -395,22 +395,22 @@ ; ALIGNED-SDAG-NEXT: ds_read_u8 v11, v0 offset:10 ; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:11 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v12, s1 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) +; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v5 offset:4 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) +; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v6 offset:5 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) -; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v9 offset:8 +; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v11 offset:10 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) +; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v0 offset:11 +; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v9 offset:8 ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v10 offset:9 -; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v3 offset:2 -; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v4 offset:3 ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v1 ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v2 offset:1 -; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v5 offset:4 -; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v6 offset:5 +; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v3 offset:2 +; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v4 offset:3 ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v7 offset:6 ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v8 offset:7 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(11) -; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v11 offset:10 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(11) -; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v0 offset:11 ; ALIGNED-SDAG-NEXT: s_endpgm ; ; ALIGNED-GISEL-LABEL: ds12align1: @@ -492,24 +492,23 @@ ; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:8 -; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0 -; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:2 -; ALIGNED-SDAG-NEXT: ds_read_u16 v4, v0 offset:4 -; ALIGNED-SDAG-NEXT: ds_read_u16 v5, v0 offset:6 +; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 +; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0 offset:2 +; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:4 +; ALIGNED-SDAG-NEXT: ds_read_u16 v4, v0 offset:10 +; ALIGNED-SDAG-NEXT: ds_read_u16 v5, v0 offset:8 +; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:6 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v6, s1 -; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:10 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) -; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v1 offset:8 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(4) -; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v3 offset:2 -; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v2 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) -; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v4 offset:4 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) -; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v5 offset:6 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) +; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v3 offset:4 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) +; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v4 offset:10 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) +; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v5 offset:8 +; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v1 +; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v2 offset:2 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) -; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v0 offset:10 +; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v0 offset:6 ; ALIGNED-SDAG-NEXT: s_endpgm ; ; ALIGNED-GISEL-LABEL: ds12align2: @@ -589,14 +588,14 @@ ; ALIGNED-SDAG: ; %bb.0: ; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; ALIGNED-SDAG-NEXT: ds_read_b32 v2, v0 offset:8 -; ALIGNED-SDAG-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 +; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; ALIGNED-SDAG-NEXT: ds_read_b64 v[0:1], v2 +; ALIGNED-SDAG-NEXT: ds_read_b32 v2, v2 offset:8 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) -; ALIGNED-SDAG-NEXT: ds_write_b32 v3, v2 offset:8 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) ; ALIGNED-SDAG-NEXT: ds_write_b64 v3, v[0:1] +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) +; ALIGNED-SDAG-NEXT: ds_write_b32 v3, v2 offset:8 ; ALIGNED-SDAG-NEXT: s_endpgm ; ; ALIGNED-GISEL-LABEL: ds12align8: @@ -667,25 +666,30 @@ ; ALIGNED-SDAG-NEXT: ds_read_u8 v15, v0 offset:14 ; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:15 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v16, s1 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) -; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v13 offset:12 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) -; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v14 offset:13 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(13) ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v3 offset:2 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(13) ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v4 offset:3 ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v1 ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v2 offset:1 -; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v5 offset:4 -; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v6 offset:5 -; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v9 offset:8 -; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v10 offset:9 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(13) ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v7 offset:6 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(13) ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v8 offset:7 +; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v5 offset:4 +; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v6 offset:5 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(13) ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v11 offset:10 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(13) ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v12 offset:11 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(14) +; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v9 offset:8 +; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v10 offset:9 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(13) ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v15 offset:14 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(13) ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v0 offset:15 +; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v13 offset:12 +; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v14 offset:13 ; ALIGNED-SDAG-NEXT: s_endpgm ; ; ALIGNED-GISEL-LABEL: ds16align1: @@ -783,29 +787,27 @@ ; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:12 ; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0 ; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:2 ; ALIGNED-SDAG-NEXT: ds_read_u16 v4, v0 offset:4 ; ALIGNED-SDAG-NEXT: ds_read_u16 v5, v0 offset:6 ; ALIGNED-SDAG-NEXT: ds_read_u16 v6, v0 offset:8 ; ALIGNED-SDAG-NEXT: ds_read_u16 v7, v0 offset:10 -; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v8, s1 +; ALIGNED-SDAG-NEXT: ds_read_u16 v8, v0 offset:12 ; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:14 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) -; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v1 offset:12 +; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(6) -; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v3 offset:2 -; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v2 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) -; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v4 offset:4 +; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v3 offset:2 +; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v2 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(6) -; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v6 offset:8 -; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v5 offset:6 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) -; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v7 offset:10 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) -; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v0 offset:14 +; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v5 offset:6 +; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v4 offset:4 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(6) +; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v7 offset:10 +; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v6 offset:8 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(6) +; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v0 offset:14 +; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v8 offset:12 ; ALIGNED-SDAG-NEXT: s_endpgm ; ; ALIGNED-GISEL-LABEL: ds16align2: @@ -856,33 +858,19 @@ } define amdgpu_kernel void @ds16align4(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) { -; ALIGNED-SDAG-LABEL: ds16align4: -; ALIGNED-SDAG: ; %bb.0: -; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s0 -; ALIGNED-SDAG-NEXT: ds_read2_b32 v[0:1], v2 offset0:2 offset1:3 -; ALIGNED-SDAG-NEXT: ds_read2_b32 v[2:3], v2 offset1:1 -; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v4, s1 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) -; ALIGNED-SDAG-NEXT: ds_write2_b32 v4, v0, v1 offset0:2 offset1:3 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) -; ALIGNED-SDAG-NEXT: ds_write2_b32 v4, v2, v3 offset1:1 -; ALIGNED-SDAG-NEXT: s_endpgm -; -; ALIGNED-GISEL-LABEL: ds16align4: -; ALIGNED-GISEL: ; %bb.0: -; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s0 -; ALIGNED-GISEL-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 -; ALIGNED-GISEL-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3 -; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1 -; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1) -; ALIGNED-GISEL-NEXT: ds_write2_b32 v4, v0, v1 offset1:1 -; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1) -; ALIGNED-GISEL-NEXT: ds_write2_b32 v4, v2, v3 offset0:2 offset1:3 -; ALIGNED-GISEL-NEXT: s_endpgm +; ALIGNED-LABEL: ds16align4: +; ALIGNED: ; %bb.0: +; ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; ALIGNED-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-NEXT: v_mov_b32_e32 v2, s0 +; ALIGNED-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 +; ALIGNED-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3 +; ALIGNED-NEXT: v_mov_b32_e32 v4, s1 +; ALIGNED-NEXT: s_waitcnt lgkmcnt(1) +; ALIGNED-NEXT: ds_write2_b32 v4, v0, v1 offset1:1 +; ALIGNED-NEXT: s_waitcnt lgkmcnt(1) +; ALIGNED-NEXT: ds_write2_b32 v4, v2, v3 offset0:2 offset1:3 +; ALIGNED-NEXT: s_endpgm ; ; UNALIGNED-LABEL: ds16align4: ; UNALIGNED: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/load-local.96.ll b/llvm/test/CodeGen/AMDGPU/load-local.96.ll --- a/llvm/test/CodeGen/AMDGPU/load-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local.96.ll @@ -352,7 +352,7 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 +; GFX9-NEXT: ds_read_b64 v[0:1], v0 ; GFX9-NEXT: ds_read_b32 v2, v2 offset:8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -362,7 +362,7 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, v0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 +; GFX7-NEXT: ds_read_b64 v[0:1], v0 ; GFX7-NEXT: ds_read_b32 v2, v2 offset:8 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -370,12 +370,11 @@ ; GFX6-LABEL: load_lds_v3i32_align8: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 8, v2 +; GFX6-NEXT: ds_read_b64 v[0:1], v0 ; GFX6-NEXT: ds_read_b32 v2, v2 -; GFX6-NEXT: ds_read_b32 v0, v0 -; GFX6-NEXT: ds_read_b32 v1, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -384,7 +383,7 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 +; GFX10-NEXT: ds_read_b64 v[0:1], v0 ; GFX10-NEXT: ds_read_b32 v2, v2 offset:8 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/store-local.128.ll --- a/llvm/test/CodeGen/AMDGPU/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/store-local.128.ll @@ -70,42 +70,42 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: ds_write_b8 v0, v1 offset:12 -; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:14 -; GFX9-NEXT: ds_write_b8 v0, v2 offset:8 -; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:10 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: ds_write_b8 v0, v1 offset:4 -; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:6 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:8 +; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:10 +; GFX9-NEXT: ds_write_b8 v0, v2 offset:12 +; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:14 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: s_lshr_b32 s0, s7, 8 ; GFX9-NEXT: ds_write_b8 v0, v1 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:2 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s0, s7, 24 -; GFX9-NEXT: ds_write_b8 v0, v1 offset:13 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: s_lshr_b32 s0, s6, 8 -; GFX9-NEXT: ds_write_b8 v0, v1 offset:15 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:4 +; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:6 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_lshr_b32 s0, s6, 24 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:9 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s0, s5, 8 +; GFX9-NEXT: s_lshr_b32 s0, s7, 8 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:11 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s0, s5, 24 -; GFX9-NEXT: ds_write_b8 v0, v1 offset:5 +; GFX9-NEXT: s_lshr_b32 s0, s7, 24 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:13 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_lshr_b32 s0, s4, 8 -; GFX9-NEXT: ds_write_b8 v0, v1 offset:7 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:15 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_lshr_b32 s0, s4, 24 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:1 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s5, 8 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:3 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s5, 24 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:5 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:7 ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v4i32_align1: @@ -115,50 +115,50 @@ ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:12 -; GFX7-NEXT: ds_write_b8 v0, v2 offset:8 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:4 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s3 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:8 +; GFX7-NEXT: ds_write_b8 v0, v2 offset:12 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: s_lshr_b32 s4, s3, 8 ; GFX7-NEXT: ds_write_b8 v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_lshr_b32 s4, s2, 8 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:4 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: s_lshr_b32 s4, s3, 24 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:13 -; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: s_lshr_b32 s3, s3, 16 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:15 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: s_lshr_b32 s3, s2, 8 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:14 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: s_lshr_b32 s3, s2, 24 +; GFX7-NEXT: s_lshr_b32 s4, s2, 24 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:9 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_lshr_b32 s2, s2, 16 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:11 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: s_lshr_b32 s2, s1, 8 +; GFX7-NEXT: s_lshr_b32 s2, s3, 8 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:10 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: s_lshr_b32 s2, s1, 24 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:5 +; GFX7-NEXT: s_lshr_b32 s2, s3, 24 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:13 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: s_lshr_b32 s1, s1, 16 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:7 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_lshr_b32 s1, s0, 8 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:6 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_lshr_b32 s1, s0, 24 +; GFX7-NEXT: s_lshr_b32 s2, s3, 16 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:15 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: s_lshr_b32 s2, s0, 8 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:14 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: s_lshr_b32 s2, s0, 24 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:1 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: s_lshr_b32 s0, s0, 16 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:3 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_lshr_b32 s0, s1, 8 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:2 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_lshr_b32 s0, s1, 24 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:5 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_lshr_b32 s0, s1, 16 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:7 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:6 ; GFX7-NEXT: s_endpgm ; ; GFX6-LABEL: store_lds_v4i32_align1: @@ -168,50 +168,50 @@ ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s3 -; GFX6-NEXT: v_mov_b32_e32 v2, s2 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:12 -; GFX6-NEXT: ds_write_b8 v0, v2 offset:8 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:4 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v2, s3 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:8 +; GFX6-NEXT: ds_write_b8 v0, v2 offset:12 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: s_lshr_b32 s4, s3, 8 ; GFX6-NEXT: ds_write_b8 v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_lshr_b32 s4, s2, 8 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:4 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: s_lshr_b32 s4, s3, 24 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:13 -; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: s_lshr_b32 s3, s3, 16 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:15 -; GFX6-NEXT: v_mov_b32_e32 v1, s3 -; GFX6-NEXT: s_lshr_b32 s3, s2, 8 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:14 -; GFX6-NEXT: v_mov_b32_e32 v1, s3 -; GFX6-NEXT: s_lshr_b32 s3, s2, 24 +; GFX6-NEXT: s_lshr_b32 s4, s2, 24 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:9 -; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_lshr_b32 s2, s2, 16 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:11 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_lshr_b32 s2, s1, 8 +; GFX6-NEXT: s_lshr_b32 s2, s3, 8 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:10 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_lshr_b32 s2, s1, 24 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:5 +; GFX6-NEXT: s_lshr_b32 s2, s3, 24 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:13 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_lshr_b32 s1, s1, 16 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:7 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: s_lshr_b32 s1, s0, 8 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:6 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: s_lshr_b32 s1, s0, 24 +; GFX6-NEXT: s_lshr_b32 s2, s3, 16 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:15 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: s_lshr_b32 s2, s0, 8 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:14 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: s_lshr_b32 s2, s0, 24 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:1 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: s_lshr_b32 s0, s0, 16 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:3 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_lshr_b32 s0, s1, 8 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_lshr_b32 s0, s1, 24 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:5 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_lshr_b32 s0, s1, 16 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:7 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:6 ; GFX6-NEXT: s_endpgm ; ; GFX10-LABEL: store_lds_v4i32_align1: @@ -221,42 +221,42 @@ ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-NEXT: s_lshr_b32 s3, s6, 24 -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: s_lshr_b32 s0, s7, 8 -; GFX10-NEXT: s_lshr_b32 s2, s6, 8 -; GFX10-NEXT: s_lshr_b32 s6, s5, 8 -; GFX10-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-NEXT: s_lshr_b32 s1, s7, 24 -; GFX10-NEXT: s_lshr_b32 s5, s5, 24 +; GFX10-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-NEXT: s_lshr_b32 s3, s7, 24 +; GFX10-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-NEXT: s_lshr_b32 s0, s6, 8 +; GFX10-NEXT: s_lshr_b32 s1, s6, 24 +; GFX10-NEXT: s_lshr_b32 s6, s4, 8 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_lshr_b32 s2, s7, 8 +; GFX10-NEXT: s_lshr_b32 s4, s4, 24 ; GFX10-NEXT: v_mov_b32_e32 v8, s3 ; GFX10-NEXT: v_mov_b32_e32 v5, s0 ; GFX10-NEXT: v_mov_b32_e32 v9, s6 -; GFX10-NEXT: s_lshr_b32 s0, s4, 8 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: s_lshr_b32 s0, s5, 8 +; GFX10-NEXT: v_mov_b32_e32 v4, s5 ; GFX10-NEXT: v_mov_b32_e32 v6, s1 ; GFX10-NEXT: v_mov_b32_e32 v7, s2 -; GFX10-NEXT: ds_write_b8 v0, v1 offset:12 -; GFX10-NEXT: ds_write_b8_d16_hi v0, v1 offset:14 -; GFX10-NEXT: ds_write_b8 v0, v2 offset:8 -; GFX10-NEXT: ds_write_b8_d16_hi v0, v2 offset:10 -; GFX10-NEXT: ds_write_b8 v0, v3 offset:4 -; GFX10-NEXT: ds_write_b8_d16_hi v0, v3 offset:6 -; GFX10-NEXT: ds_write_b8 v0, v4 -; GFX10-NEXT: ds_write_b8_d16_hi v0, v4 offset:2 -; GFX10-NEXT: ds_write_b8 v0, v5 offset:13 -; GFX10-NEXT: ds_write_b8 v0, v6 offset:15 -; GFX10-NEXT: ds_write_b8 v0, v7 offset:9 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: s_lshr_b32 s1, s4, 24 +; GFX10-NEXT: ds_write_b8 v0, v1 offset:8 +; GFX10-NEXT: ds_write_b8_d16_hi v0, v1 offset:10 +; GFX10-NEXT: ds_write_b8 v0, v2 offset:12 +; GFX10-NEXT: ds_write_b8_d16_hi v0, v2 offset:14 +; GFX10-NEXT: ds_write_b8 v0, v3 +; GFX10-NEXT: ds_write_b8_d16_hi v0, v3 offset:2 +; GFX10-NEXT: ds_write_b8 v0, v4 offset:4 +; GFX10-NEXT: ds_write_b8_d16_hi v0, v4 offset:6 +; GFX10-NEXT: ds_write_b8 v0, v5 offset:9 +; GFX10-NEXT: ds_write_b8 v0, v6 offset:11 +; GFX10-NEXT: ds_write_b8 v0, v7 offset:13 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: s_lshr_b32 s1, s5, 24 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: ds_write_b8 v0, v8 offset:11 -; GFX10-NEXT: ds_write_b8 v0, v9 offset:5 -; GFX10-NEXT: ds_write_b8 v0, v1 offset:7 -; GFX10-NEXT: ds_write_b8 v0, v2 offset:1 -; GFX10-NEXT: ds_write_b8 v0, v3 offset:3 +; GFX10-NEXT: ds_write_b8 v0, v8 offset:15 +; GFX10-NEXT: ds_write_b8 v0, v9 offset:1 +; GFX10-NEXT: ds_write_b8 v0, v1 offset:3 +; GFX10-NEXT: ds_write_b8 v0, v2 offset:5 +; GFX10-NEXT: ds_write_b8 v0, v3 offset:7 ; GFX10-NEXT: s_endpgm store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1 ret void @@ -269,18 +269,18 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: ds_write_b16 v0, v1 offset:12 -; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:14 -; GFX9-NEXT: ds_write_b16 v0, v2 offset:8 -; GFX9-NEXT: ds_write_b16_d16_hi v0, v2 offset:10 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: ds_write_b16 v0, v1 offset:4 -; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:6 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: ds_write_b16 v0, v1 offset:8 +; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:10 +; GFX9-NEXT: ds_write_b16 v0, v2 offset:12 +; GFX9-NEXT: ds_write_b16_d16_hi v0, v2 offset:14 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: ds_write_b16 v0, v1 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:2 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: ds_write_b16 v0, v1 offset:4 +; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:6 ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v4i32_align2: @@ -290,26 +290,26 @@ ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: ds_write_b16 v0, v1 offset:12 -; GFX7-NEXT: ds_write_b16 v0, v2 offset:8 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: ds_write_b16 v0, v1 offset:4 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s3 +; GFX7-NEXT: ds_write_b16 v0, v1 offset:8 +; GFX7-NEXT: ds_write_b16 v0, v2 offset:12 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: s_lshr_b32 s3, s3, 16 ; GFX7-NEXT: ds_write_b16 v0, v1 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: s_lshr_b32 s2, s2, 16 -; GFX7-NEXT: ds_write_b16 v0, v1 offset:14 +; GFX7-NEXT: ds_write_b16 v0, v1 offset:4 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: s_lshr_b32 s1, s1, 16 +; GFX7-NEXT: s_lshr_b32 s2, s3, 16 ; GFX7-NEXT: ds_write_b16 v0, v1 offset:10 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: s_lshr_b32 s0, s0, 16 -; GFX7-NEXT: ds_write_b16 v0, v1 offset:6 +; GFX7-NEXT: ds_write_b16 v0, v1 offset:14 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_lshr_b32 s0, s1, 16 ; GFX7-NEXT: ds_write_b16 v0, v1 offset:2 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b16 v0, v1 offset:6 ; GFX7-NEXT: s_endpgm ; ; GFX6-LABEL: store_lds_v4i32_align2: @@ -319,26 +319,26 @@ ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s3 -; GFX6-NEXT: v_mov_b32_e32 v2, s2 -; GFX6-NEXT: ds_write_b16 v0, v1 offset:12 -; GFX6-NEXT: ds_write_b16 v0, v2 offset:8 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: ds_write_b16 v0, v1 offset:4 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v2, s3 +; GFX6-NEXT: ds_write_b16 v0, v1 offset:8 +; GFX6-NEXT: ds_write_b16 v0, v2 offset:12 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: s_lshr_b32 s3, s3, 16 ; GFX6-NEXT: ds_write_b16 v0, v1 -; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: s_lshr_b32 s2, s2, 16 -; GFX6-NEXT: ds_write_b16 v0, v1 offset:14 +; GFX6-NEXT: ds_write_b16 v0, v1 offset:4 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_lshr_b32 s1, s1, 16 +; GFX6-NEXT: s_lshr_b32 s2, s3, 16 ; GFX6-NEXT: ds_write_b16 v0, v1 offset:10 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: s_lshr_b32 s0, s0, 16 -; GFX6-NEXT: ds_write_b16 v0, v1 offset:6 +; GFX6-NEXT: ds_write_b16 v0, v1 offset:14 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_lshr_b32 s0, s1, 16 ; GFX6-NEXT: ds_write_b16 v0, v1 offset:2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_write_b16 v0, v1 offset:6 ; GFX6-NEXT: s_endpgm ; ; GFX10-LABEL: store_lds_v4i32_align2: @@ -348,18 +348,18 @@ ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: ds_write_b16 v0, v1 offset:12 -; GFX10-NEXT: ds_write_b16_d16_hi v0, v1 offset:14 -; GFX10-NEXT: ds_write_b16 v0, v2 offset:8 -; GFX10-NEXT: ds_write_b16_d16_hi v0, v2 offset:10 -; GFX10-NEXT: ds_write_b16 v0, v3 offset:4 -; GFX10-NEXT: ds_write_b16_d16_hi v0, v3 offset:6 -; GFX10-NEXT: ds_write_b16 v0, v4 -; GFX10-NEXT: ds_write_b16_d16_hi v0, v4 offset:2 +; GFX10-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: v_mov_b32_e32 v4, s5 +; GFX10-NEXT: ds_write_b16 v0, v1 offset:8 +; GFX10-NEXT: ds_write_b16_d16_hi v0, v1 offset:10 +; GFX10-NEXT: ds_write_b16 v0, v2 offset:12 +; GFX10-NEXT: ds_write_b16_d16_hi v0, v2 offset:14 +; GFX10-NEXT: ds_write_b16 v0, v3 +; GFX10-NEXT: ds_write_b16_d16_hi v0, v3 offset:2 +; GFX10-NEXT: ds_write_b16 v0, v4 offset:4 +; GFX10-NEXT: ds_write_b16_d16_hi v0, v4 offset:6 ; GFX10-NEXT: s_endpgm store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 2 ret void @@ -402,12 +402,12 @@ ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: v_mov_b32_e32 v2, s3 ; GFX6-NEXT: ds_write2_b32 v0, v1, v2 offset0:2 offset1:3 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mov_b32_e32 v2, s1 +; GFX6-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 ; GFX6-NEXT: s_endpgm ; ; GFX10-LABEL: store_lds_v4i32_align4: diff --git a/llvm/test/CodeGen/AMDGPU/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/store-local.96.ll --- a/llvm/test/CodeGen/AMDGPU/store-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/store-local.96.ll @@ -68,32 +68,32 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:8 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:10 -; GFX9-NEXT: ds_write_b8 v0, v2 offset:4 -; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:6 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: ds_write_b8 v0, v2 +; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:2 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: s_lshr_b32 s0, s6, 8 -; GFX9-NEXT: ds_write_b8 v0, v1 -; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:2 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:4 +; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:6 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_lshr_b32 s0, s6, 24 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:9 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s0, s5, 8 -; GFX9-NEXT: ds_write_b8 v0, v1 offset:11 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s0, s5, 24 -; GFX9-NEXT: ds_write_b8 v0, v1 offset:5 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_lshr_b32 s0, s4, 8 -; GFX9-NEXT: ds_write_b8 v0, v1 offset:7 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:11 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_lshr_b32 s0, s4, 24 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:1 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s5, 8 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:3 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s5, 24 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:5 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:7 ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v3i32_align1: @@ -104,12 +104,12 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:8 -; GFX7-NEXT: ds_write_b8 v0, v2 offset:4 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b8 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: s_lshr_b32 s3, s2, 8 -; GFX7-NEXT: ds_write_b8 v0, v1 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:4 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_lshr_b32 s3, s2, 24 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:9 @@ -117,25 +117,25 @@ ; GFX7-NEXT: s_lshr_b32 s2, s2, 16 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:11 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: s_lshr_b32 s2, s1, 8 +; GFX7-NEXT: s_lshr_b32 s2, s0, 8 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:10 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: s_lshr_b32 s2, s1, 24 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:5 -; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: s_lshr_b32 s1, s1, 16 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:7 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_lshr_b32 s1, s0, 8 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:6 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_lshr_b32 s1, s0, 24 +; GFX7-NEXT: s_lshr_b32 s2, s0, 24 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:1 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: s_lshr_b32 s0, s0, 16 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:3 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_lshr_b32 s0, s1, 8 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:2 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_lshr_b32 s0, s1, 24 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:5 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_lshr_b32 s0, s1, 16 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:7 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:6 ; GFX7-NEXT: s_endpgm ; ; GFX6-LABEL: store_lds_v3i32_align1: @@ -146,12 +146,12 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: v_mov_b32_e32 v2, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:8 -; GFX6-NEXT: ds_write_b8 v0, v2 offset:4 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_write_b8 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: s_lshr_b32 s3, s2, 8 -; GFX6-NEXT: ds_write_b8 v0, v1 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:4 ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_lshr_b32 s3, s2, 24 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:9 @@ -159,25 +159,25 @@ ; GFX6-NEXT: s_lshr_b32 s2, s2, 16 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:11 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_lshr_b32 s2, s1, 8 +; GFX6-NEXT: s_lshr_b32 s2, s0, 8 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:10 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_lshr_b32 s2, s1, 24 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:5 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_lshr_b32 s1, s1, 16 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:7 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: s_lshr_b32 s1, s0, 8 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:6 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: s_lshr_b32 s1, s0, 24 +; GFX6-NEXT: s_lshr_b32 s2, s0, 24 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:1 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: s_lshr_b32 s0, s0, 16 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:3 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_lshr_b32 s0, s1, 8 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_lshr_b32 s0, s1, 24 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:5 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_lshr_b32 s0, s1, 16 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:7 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:6 ; GFX6-NEXT: s_endpgm ; ; GFX10-LABEL: store_lds_v3i32_align1: @@ -188,32 +188,32 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: v_mov_b32_e32 v3, s5 ; GFX10-NEXT: s_lshr_b32 s0, s6, 8 ; GFX10-NEXT: s_lshr_b32 s1, s6, 24 -; GFX10-NEXT: s_lshr_b32 s2, s5, 8 -; GFX10-NEXT: s_lshr_b32 s3, s5, 24 -; GFX10-NEXT: s_lshr_b32 s5, s4, 8 -; GFX10-NEXT: s_lshr_b32 s4, s4, 24 +; GFX10-NEXT: s_lshr_b32 s2, s4, 8 +; GFX10-NEXT: s_lshr_b32 s3, s4, 24 +; GFX10-NEXT: s_lshr_b32 s4, s5, 8 +; GFX10-NEXT: s_lshr_b32 s5, s5, 24 ; GFX10-NEXT: v_mov_b32_e32 v4, s0 ; GFX10-NEXT: v_mov_b32_e32 v5, s1 ; GFX10-NEXT: v_mov_b32_e32 v6, s2 ; GFX10-NEXT: v_mov_b32_e32 v7, s3 -; GFX10-NEXT: v_mov_b32_e32 v8, s5 -; GFX10-NEXT: v_mov_b32_e32 v9, s4 +; GFX10-NEXT: v_mov_b32_e32 v8, s4 +; GFX10-NEXT: v_mov_b32_e32 v9, s5 ; GFX10-NEXT: ds_write_b8 v0, v1 offset:8 ; GFX10-NEXT: ds_write_b8_d16_hi v0, v1 offset:10 -; GFX10-NEXT: ds_write_b8 v0, v2 offset:4 -; GFX10-NEXT: ds_write_b8_d16_hi v0, v2 offset:6 -; GFX10-NEXT: ds_write_b8 v0, v3 -; GFX10-NEXT: ds_write_b8_d16_hi v0, v3 offset:2 +; GFX10-NEXT: ds_write_b8 v0, v2 +; GFX10-NEXT: ds_write_b8_d16_hi v0, v2 offset:2 +; GFX10-NEXT: ds_write_b8 v0, v3 offset:4 +; GFX10-NEXT: ds_write_b8_d16_hi v0, v3 offset:6 ; GFX10-NEXT: ds_write_b8 v0, v4 offset:9 ; GFX10-NEXT: ds_write_b8 v0, v5 offset:11 -; GFX10-NEXT: ds_write_b8 v0, v6 offset:5 -; GFX10-NEXT: ds_write_b8 v0, v7 offset:7 -; GFX10-NEXT: ds_write_b8 v0, v8 offset:1 -; GFX10-NEXT: ds_write_b8 v0, v9 offset:3 +; GFX10-NEXT: ds_write_b8 v0, v6 offset:1 +; GFX10-NEXT: ds_write_b8 v0, v7 offset:3 +; GFX10-NEXT: ds_write_b8 v0, v8 offset:5 +; GFX10-NEXT: ds_write_b8 v0, v9 offset:7 ; GFX10-NEXT: s_endpgm store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1 ret void @@ -227,14 +227,14 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: ds_write_b16 v0, v1 offset:8 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:10 -; GFX9-NEXT: ds_write_b16 v0, v2 offset:4 -; GFX9-NEXT: ds_write_b16_d16_hi v0, v2 offset:6 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: ds_write_b16 v0, v1 -; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:2 +; GFX9-NEXT: ds_write_b16 v0, v2 +; GFX9-NEXT: ds_write_b16_d16_hi v0, v2 offset:2 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: ds_write_b16 v0, v1 offset:4 +; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:6 ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v3i32_align2: @@ -245,20 +245,20 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: ds_write_b16 v0, v1 offset:8 -; GFX7-NEXT: ds_write_b16 v0, v2 offset:4 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b16 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: s_lshr_b32 s2, s2, 16 -; GFX7-NEXT: ds_write_b16 v0, v1 +; GFX7-NEXT: ds_write_b16 v0, v1 offset:4 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: s_lshr_b32 s1, s1, 16 -; GFX7-NEXT: ds_write_b16 v0, v1 offset:10 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: s_lshr_b32 s0, s0, 16 -; GFX7-NEXT: ds_write_b16 v0, v1 offset:6 +; GFX7-NEXT: ds_write_b16 v0, v1 offset:10 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_lshr_b32 s0, s1, 16 ; GFX7-NEXT: ds_write_b16 v0, v1 offset:2 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b16 v0, v1 offset:6 ; GFX7-NEXT: s_endpgm ; ; GFX6-LABEL: store_lds_v3i32_align2: @@ -269,20 +269,20 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: v_mov_b32_e32 v2, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: ds_write_b16 v0, v1 offset:8 -; GFX6-NEXT: ds_write_b16 v0, v2 offset:4 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_write_b16 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: s_lshr_b32 s2, s2, 16 -; GFX6-NEXT: ds_write_b16 v0, v1 +; GFX6-NEXT: ds_write_b16 v0, v1 offset:4 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_lshr_b32 s1, s1, 16 -; GFX6-NEXT: ds_write_b16 v0, v1 offset:10 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: s_lshr_b32 s0, s0, 16 -; GFX6-NEXT: ds_write_b16 v0, v1 offset:6 +; GFX6-NEXT: ds_write_b16 v0, v1 offset:10 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_lshr_b32 s0, s1, 16 ; GFX6-NEXT: ds_write_b16 v0, v1 offset:2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_write_b16 v0, v1 offset:6 ; GFX6-NEXT: s_endpgm ; ; GFX10-LABEL: store_lds_v3i32_align2: @@ -293,14 +293,14 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: v_mov_b32_e32 v3, s5 ; GFX10-NEXT: ds_write_b16 v0, v1 offset:8 ; GFX10-NEXT: ds_write_b16_d16_hi v0, v1 offset:10 -; GFX10-NEXT: ds_write_b16 v0, v2 offset:4 -; GFX10-NEXT: ds_write_b16_d16_hi v0, v2 offset:6 -; GFX10-NEXT: ds_write_b16 v0, v3 -; GFX10-NEXT: ds_write_b16_d16_hi v0, v3 offset:2 +; GFX10-NEXT: ds_write_b16 v0, v2 +; GFX10-NEXT: ds_write_b16_d16_hi v0, v2 offset:2 +; GFX10-NEXT: ds_write_b16 v0, v3 offset:4 +; GFX10-NEXT: ds_write_b16_d16_hi v0, v3 offset:6 ; GFX10-NEXT: s_endpgm store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 2 ret void @@ -341,11 +341,11 @@ ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: ds_write_b32 v0, v1 offset:8 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_write2_b32 v0, v2, v1 offset1:1 ; GFX6-NEXT: s_endpgm ; ; GFX10-LABEL: store_lds_v3i32_align4: