diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td --- a/llvm/lib/Target/AMDGPU/SISchedule.td +++ b/llvm/lib/Target/AMDGPU/SISchedule.td @@ -93,16 +93,16 @@ let BufferSize = 1; } def HWExport : ProcResource<1> { - let BufferSize = 7; // Taken from S_WAITCNT + let BufferSize = 1; } def HWLGKM : ProcResource<1> { - let BufferSize = 31; // Taken from S_WAITCNT + let BufferSize = 1; } def HWSALU : ProcResource<1> { let BufferSize = 1; } def HWVMEM : ProcResource<1> { - let BufferSize = 15; // Taken from S_WAITCNT + let BufferSize = 1; } def HWVALU : ProcResource<1> { let BufferSize = 1; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll @@ -452,35 +452,35 @@ define amdgpu_kernel void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: load_i8_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b64 s[6:7], s[2:3] +; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: load_i8_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s1 -; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, v2, v3, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -527,19 +527,18 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32_unaligned: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1 -; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:2 -; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:3 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b64 s[6:7], s[2:3] +; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[4:7], 0 addr64 offset:1 +; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:2 +; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[4:7], 0 addr64 offset:3 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2 ; SI-NEXT: s_waitcnt vmcnt(2) @@ -548,17 +547,18 @@ ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v5 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: load_v4i8_to_v4f32_unaligned: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0 @@ -571,8 +571,8 @@ ; VI-NEXT: flat_load_ubyte v1, v[2:3] ; VI-NEXT: flat_load_ubyte v2, v[4:5] ; VI-NEXT: flat_load_ubyte v3, v[6:7] -; VI-NEXT: v_mov_b32_e32 v5, s3 -; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; VI-NEXT: s_waitcnt vmcnt(2) @@ -628,39 +628,39 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: i8_zext_inreg_i32_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b64 s[6:7], s[2:3] +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 2, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: i8_zext_inreg_i32_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -676,37 +676,37 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: i8_zext_inreg_hi1_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b64 s[6:7], s[2:3] +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bfe_u32 v0, v0, 8, 8 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: i8_zext_inreg_hi1_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -724,35 +724,35 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: i8_zext_i32_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b64 s[6:7], s[2:3] +; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: i8_zext_i32_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s1 -; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, v2, v3, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -767,19 +767,18 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { ; SI-LABEL: v4i8_zext_v4i32_to_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1 -; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:2 -; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:3 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b64 s[6:7], s[2:3] +; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[4:7], 0 addr64 offset:1 +; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:2 +; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[4:7], 0 addr64 offset:3 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2 ; SI-NEXT: s_waitcnt vmcnt(2) @@ -788,17 +787,18 @@ ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v5 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v4i8_zext_v4i32_to_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0 @@ -811,8 +811,8 @@ ; VI-NEXT: flat_load_ubyte v1, v[2:3] ; VI-NEXT: flat_load_ubyte v2, v[4:5] ; VI-NEXT: flat_load_ubyte v3, v[6:7] -; VI-NEXT: v_mov_b32_e32 v5, s3 -; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; VI-NEXT: s_waitcnt vmcnt(2) @@ -835,36 +835,36 @@ define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: extract_byte0_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b64 s[6:7], s[2:3] +; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: extract_byte0_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -879,37 +879,37 @@ define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: extract_byte1_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b64 s[6:7], s[2:3] +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bfe_u32 v0, v0, 8, 8 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: extract_byte1_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -925,37 +925,37 @@ define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: extract_byte2_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b64 s[6:7], s[2:3] +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bfe_u32 v0, v0, 16, 8 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: extract_byte2_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -971,36 +971,36 @@ define amdgpu_kernel void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: extract_byte3_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b64 s[6:7], s[2:3] +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: extract_byte3_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte3_e32 v2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll @@ -141,16 +141,16 @@ ; CHECK-NEXT: s_cmp_lg_u32 s4, 0 ; CHECK-NEXT: s_cbranch_scc1 .LBB4_4 ; CHECK-NEXT: ; %bb.1: ; %bb2 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, const.ptr@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, const.ptr@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, const.ptr@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, const.ptr@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_mov_b32 s4, -1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_load_dword v0, v0, s[6:7] +; CHECK-NEXT: global_load_dword v0, v0, s[4:5] +; CHECK-NEXT: s_mov_b32 s4, -1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, 1.0, v0 ; CHECK-NEXT: s_cbranch_vccnz .LBB4_3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll @@ -126,58 +126,58 @@ ; GFX9-LABEL: extractelement_vgpr_v4i128_vgpr_idx: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 1, v2 -; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off -; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 -; GFX9-NEXT: v_add_u32_e32 v17, 1, v16 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 6, v16 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 7, v16 +; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off +; GFX9-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 1, v2 +; GFX9-NEXT: v_add_u32_e32 v16, 1, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v16 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 6, v2 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 7, v2 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e64 v10, v2, v4, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v11, v3, v5, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v12, v4, v6, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v4, v10, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v5, v11, v7, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v12, v8, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v16 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v16 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v16 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc ; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:32 ; GFX9-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v16 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 7, v17 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 7, v16 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v9, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 6, v17 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v11, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 6, v16 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[6:7] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v13, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v14, s[8:9] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v15, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v14, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v15, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v3, v14, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v4, v15, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v4i128_vgpr_idx: @@ -248,58 +248,58 @@ ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_lshlrev_b32_e32 v16, 1, v2 -; GFX7-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: buffer_load_dwordx4 v[6:9], v[0:1], s[8:11], 0 addr64 offset:16 -; GFX7-NEXT: v_add_i32_e32 v17, vcc, 1, v16 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 6, v16 +; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[8:11], 0 addr64 offset:16 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 1, v2 +; GFX7-NEXT: v_add_i32_e32 v16, vcc, 1, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v16 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 6, v2 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_cndmask_b32_e64 v10, v2, v4, s[4:5] ; GFX7-NEXT: v_cndmask_b32_e64 v11, v3, v5, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v12, v4, v6, s[4:5] ; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 2, v16 +; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cndmask_b32_e32 v4, v10, v6, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v5, v11, v7, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 2, v17 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v12, v8, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 2, v16 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 3, v16 ; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 ; GFX7-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 3, v17 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 3, v16 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc ; GFX7-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:32 ; GFX7-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[8:11], 0 addr64 offset:48 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 4, v16 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 7, v17 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 7, v16 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 7, v16 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 7, v2 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v8, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 4, v17 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 5, v16 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v9, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 4, v16 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 5, v17 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 6, v17 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 5, v16 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v11, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 6, v16 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[6:7] ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v13, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v14, s[8:9] ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v15, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v14, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v15, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v2, v3, v14, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v4, v15, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v4i128_vgpr_idx: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll @@ -764,28 +764,28 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GCN-NEXT: s_mov_b32 s7, 0x80008 -; GCN-NEXT: s_movk_i32 s5, 0xff +; GCN-NEXT: s_movk_i32 s2, 0xff ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_bfe_u32 s8, s0, s7 -; GCN-NEXT: s_and_b32 s6, s0, s5 +; GCN-NEXT: s_and_b32 s6, s0, s2 ; GCN-NEXT: s_lshl_b32 s8, s8, 8 ; GCN-NEXT: s_or_b32 s6, s6, s8 ; GCN-NEXT: s_mov_b32 s8, 0x80010 -; GCN-NEXT: s_lshr_b32 s2, s0, 24 +; GCN-NEXT: s_lshr_b32 s3, s0, 24 ; GCN-NEXT: s_bfe_u32 s0, s0, s8 ; GCN-NEXT: s_lshl_b32 s0, s0, 16 ; GCN-NEXT: s_or_b32 s0, s6, s0 -; GCN-NEXT: s_lshl_b32 s2, s2, 24 -; GCN-NEXT: s_or_b32 s0, s0, s2 -; GCN-NEXT: s_and_b32 s2, s1, s5 -; GCN-NEXT: s_bfe_u32 s5, s1, s7 -; GCN-NEXT: s_lshr_b32 s3, s1, 24 -; GCN-NEXT: s_lshl_b32 s5, s5, 8 +; GCN-NEXT: s_lshl_b32 s3, s3, 24 +; GCN-NEXT: s_or_b32 s0, s0, s3 +; GCN-NEXT: s_bfe_u32 s3, s1, s7 +; GCN-NEXT: s_lshr_b32 s5, s1, 24 +; GCN-NEXT: s_and_b32 s2, s1, s2 +; GCN-NEXT: s_lshl_b32 s3, s3, 8 ; GCN-NEXT: s_bfe_u32 s1, s1, s8 -; GCN-NEXT: s_or_b32 s2, s2, s5 +; GCN-NEXT: s_or_b32 s2, s2, s3 ; GCN-NEXT: s_lshl_b32 s1, s1, 16 ; GCN-NEXT: s_or_b32 s1, s2, s1 -; GCN-NEXT: s_lshl_b32 s2, s3, 24 +; GCN-NEXT: s_lshl_b32 s2, s5, 24 ; GCN-NEXT: s_or_b32 s1, s1, s2 ; GCN-NEXT: s_lshr_b32 s2, s4, 2 ; GCN-NEXT: s_cmp_eq_u32 s2, 1 @@ -1094,30 +1094,30 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GCN-NEXT: s_mov_b32 s6, 0x80008 -; GCN-NEXT: s_movk_i32 s4, 0xff +; GCN-NEXT: s_movk_i32 s2, 0xff ; GCN-NEXT: v_lshrrev_b32_e32 v1, 2, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_bfe_u32 s7, s0, s6 -; GCN-NEXT: s_and_b32 s5, s0, s4 +; GCN-NEXT: s_and_b32 s5, s0, s2 ; GCN-NEXT: s_lshl_b32 s7, s7, 8 ; GCN-NEXT: s_or_b32 s5, s5, s7 ; GCN-NEXT: s_mov_b32 s7, 0x80010 -; GCN-NEXT: s_lshr_b32 s2, s0, 24 +; GCN-NEXT: s_lshr_b32 s3, s0, 24 ; GCN-NEXT: s_bfe_u32 s0, s0, s7 ; GCN-NEXT: s_lshl_b32 s0, s0, 16 ; GCN-NEXT: s_or_b32 s0, s5, s0 -; GCN-NEXT: s_lshl_b32 s2, s2, 24 -; GCN-NEXT: s_or_b32 s0, s0, s2 -; GCN-NEXT: s_and_b32 s2, s1, s4 -; GCN-NEXT: s_bfe_u32 s4, s1, s6 -; GCN-NEXT: s_lshr_b32 s3, s1, 24 -; GCN-NEXT: s_lshl_b32 s4, s4, 8 +; GCN-NEXT: s_lshl_b32 s3, s3, 24 +; GCN-NEXT: s_or_b32 s0, s0, s3 +; GCN-NEXT: s_bfe_u32 s3, s1, s6 +; GCN-NEXT: s_lshr_b32 s4, s1, 24 +; GCN-NEXT: s_and_b32 s2, s1, s2 +; GCN-NEXT: s_lshl_b32 s3, s3, 8 ; GCN-NEXT: s_bfe_u32 s1, s1, s7 -; GCN-NEXT: s_or_b32 s2, s2, s4 +; GCN-NEXT: s_or_b32 s2, s2, s3 ; GCN-NEXT: s_lshl_b32 s1, s1, 16 ; GCN-NEXT: s_or_b32 s1, s2, s1 -; GCN-NEXT: s_lshl_b32 s2, s3, 24 +; GCN-NEXT: s_lshl_b32 s2, s4, 24 ; GCN-NEXT: s_or_b32 s1, s1, s2 ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: v_mov_b32_e32 v3, s1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -2748,25 +2748,25 @@ ; GPRIDX-NEXT: runtime_loader_kernel_symbol = 0 ; GPRIDX-NEXT: .end_amd_kernel_code_t ; GPRIDX-NEXT: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GPRIDX-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GPRIDX-NEXT: s_load_dword s8, s[4:5], 0x8 -; GPRIDX-NEXT: s_mov_b32 s0, 0 -; GPRIDX-NEXT: s_mov_b32 s1, 0x40140000 -; GPRIDX-NEXT: s_mov_b32 s3, 0x40080000 -; GPRIDX-NEXT: s_mov_b32 s2, s0 +; GPRIDX-NEXT: s_mov_b32 s2, 0 +; GPRIDX-NEXT: s_mov_b32 s3, 0x40140000 +; GPRIDX-NEXT: s_mov_b32 s5, 0x40080000 +; GPRIDX-NEXT: s_mov_b32 s4, s2 ; GPRIDX-NEXT: s_waitcnt lgkmcnt(0) ; GPRIDX-NEXT: s_cmp_eq_u32 s8, 1 -; GPRIDX-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0 +; GPRIDX-NEXT: s_cselect_b64 s[6:7], 2.0, 1.0 ; GPRIDX-NEXT: s_cmp_eq_u32 s8, 2 -; GPRIDX-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] +; GPRIDX-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] ; GPRIDX-NEXT: s_cmp_eq_u32 s8, 3 -; GPRIDX-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3] +; GPRIDX-NEXT: s_cselect_b64 s[4:5], 4.0, s[4:5] ; GPRIDX-NEXT: s_cmp_eq_u32 s8, 4 -; GPRIDX-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] -; GPRIDX-NEXT: v_mov_b32_e32 v0, s0 -; GPRIDX-NEXT: v_mov_b32_e32 v1, s1 +; GPRIDX-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] +; GPRIDX-NEXT: v_mov_b32_e32 v0, s2 +; GPRIDX-NEXT: v_mov_b32_e32 v1, s3 ; GPRIDX-NEXT: v_mov_b32_e32 v2, 0 -; GPRIDX-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GPRIDX-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GPRIDX-NEXT: s_endpgm ; ; MOVREL-LABEL: dyn_extract_v5f64_s_s: @@ -2839,25 +2839,25 @@ ; MOVREL-NEXT: runtime_loader_kernel_symbol = 0 ; MOVREL-NEXT: .end_amd_kernel_code_t ; MOVREL-NEXT: ; %bb.0: ; %entry -; MOVREL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; MOVREL-NEXT: s_load_dword s8, s[4:5], 0x8 -; MOVREL-NEXT: s_mov_b32 s0, 0 -; MOVREL-NEXT: s_mov_b32 s1, 0x40140000 -; MOVREL-NEXT: s_mov_b32 s3, 0x40080000 -; MOVREL-NEXT: s_mov_b32 s2, s0 +; MOVREL-NEXT: s_mov_b32 s2, 0 +; MOVREL-NEXT: s_mov_b32 s3, 0x40140000 +; MOVREL-NEXT: s_mov_b32 s5, 0x40080000 +; MOVREL-NEXT: s_mov_b32 s4, s2 ; MOVREL-NEXT: s_waitcnt lgkmcnt(0) ; MOVREL-NEXT: s_cmp_eq_u32 s8, 1 -; MOVREL-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0 +; MOVREL-NEXT: s_cselect_b64 s[6:7], 2.0, 1.0 ; MOVREL-NEXT: s_cmp_eq_u32 s8, 2 -; MOVREL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] +; MOVREL-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] ; MOVREL-NEXT: s_cmp_eq_u32 s8, 3 -; MOVREL-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3] +; MOVREL-NEXT: s_cselect_b64 s[4:5], 4.0, s[4:5] ; MOVREL-NEXT: s_cmp_eq_u32 s8, 4 -; MOVREL-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] -; MOVREL-NEXT: v_mov_b32_e32 v0, s0 -; MOVREL-NEXT: v_mov_b32_e32 v2, s6 -; MOVREL-NEXT: v_mov_b32_e32 v1, s1 -; MOVREL-NEXT: v_mov_b32_e32 v3, s7 +; MOVREL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] +; MOVREL-NEXT: v_mov_b32_e32 v0, s2 +; MOVREL-NEXT: v_mov_b32_e32 v3, s1 +; MOVREL-NEXT: v_mov_b32_e32 v1, s3 +; MOVREL-NEXT: v_mov_b32_e32 v2, s0 ; MOVREL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; MOVREL-NEXT: s_endpgm ; @@ -3577,8 +3577,8 @@ ; GPRIDX-NEXT: runtime_loader_kernel_symbol = 0 ; GPRIDX-NEXT: .end_amd_kernel_code_t ; GPRIDX-NEXT: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GPRIDX-NEXT: s_load_dword s2, s[4:5], 0x8 +; GPRIDX-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GPRIDX-NEXT: v_mov_b32_e32 v1, 0 ; GPRIDX-NEXT: s_waitcnt lgkmcnt(0) ; GPRIDX-NEXT: s_cmp_eq_u32 s2, 1 @@ -3661,16 +3661,16 @@ ; MOVREL-NEXT: runtime_loader_kernel_symbol = 0 ; MOVREL-NEXT: .end_amd_kernel_code_t ; MOVREL-NEXT: ; %bb.0: ; %entry -; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; MOVREL-NEXT: s_load_dword s2, s[4:5], 0x8 +; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; MOVREL-NEXT: v_mov_b32_e32 v0, s0 ; MOVREL-NEXT: s_cmp_eq_u32 s2, 1 ; MOVREL-NEXT: s_cselect_b32 s3, 2.0, 1.0 ; MOVREL-NEXT: s_cmp_eq_u32 s2, 2 ; MOVREL-NEXT: s_cselect_b32 s3, 0x40400000, s3 ; MOVREL-NEXT: s_cmp_eq_u32 s2, 3 ; MOVREL-NEXT: s_cselect_b32 s2, 4.0, s3 +; MOVREL-NEXT: v_mov_b32_e32 v0, s0 ; MOVREL-NEXT: v_mov_b32_e32 v2, s2 ; MOVREL-NEXT: v_mov_b32_e32 v1, s1 ; MOVREL-NEXT: flat_store_dword v[0:1], v2 @@ -3837,8 +3837,8 @@ ; GPRIDX-NEXT: runtime_loader_kernel_symbol = 0 ; GPRIDX-NEXT: .end_amd_kernel_code_t ; GPRIDX-NEXT: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GPRIDX-NEXT: s_load_dword s6, s[4:5], 0x8 +; GPRIDX-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GPRIDX-NEXT: s_mov_b32 s2, 0 ; GPRIDX-NEXT: s_mov_b32 s3, 0x40080000 ; GPRIDX-NEXT: v_mov_b32_e32 v2, 0 @@ -3924,12 +3924,11 @@ ; MOVREL-NEXT: runtime_loader_kernel_symbol = 0 ; MOVREL-NEXT: .end_amd_kernel_code_t ; MOVREL-NEXT: ; %bb.0: ; %entry -; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; MOVREL-NEXT: s_load_dword s6, s[4:5], 0x8 +; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; MOVREL-NEXT: s_mov_b32 s2, 0 ; MOVREL-NEXT: s_mov_b32 s3, 0x40080000 ; MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; MOVREL-NEXT: v_mov_b32_e32 v3, s1 ; MOVREL-NEXT: s_cmp_eq_u32 s6, 1 ; MOVREL-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0 ; MOVREL-NEXT: s_cmp_eq_u32 s6, 2 @@ -3937,6 +3936,7 @@ ; MOVREL-NEXT: s_cmp_eq_u32 s6, 3 ; MOVREL-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3] ; MOVREL-NEXT: v_mov_b32_e32 v0, s2 +; MOVREL-NEXT: v_mov_b32_e32 v3, s1 ; MOVREL-NEXT: v_mov_b32_e32 v1, s3 ; MOVREL-NEXT: v_mov_b32_e32 v2, s0 ; MOVREL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll @@ -31,33 +31,33 @@ ; VI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v8, 2, v0 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v8 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v8 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_mov_b32_e32 v4, s6 ; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v8 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] glc +; VI-NEXT: flat_load_dword v7, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v1, v[2:3] glc +; VI-NEXT: flat_load_dword v2, v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v2, v[4:5] glc +; VI-NEXT: flat_load_dword v3, v[4:5] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v7, s1 -; VI-NEXT: v_mov_b32_e32 v6, s0 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v8 -; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; VI-NEXT: v_sub_f32_e32 v0, 0x80000000, v0 -; VI-NEXT: v_med3_f32 v0, v0, v1, v2 -; VI-NEXT: flat_store_dword v[6:7], v0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_sub_f32_e32 v4, 0x80000000, v7 +; VI-NEXT: v_med3_f32 v2, v4, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: @@ -153,13 +153,13 @@ ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_mov_b32_e32 v4, s6 ; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: flat_load_dword v7, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_load_dword v2, v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: v_add_u32_e32 v0, vcc, v4, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll @@ -19,9 +19,9 @@ define amdgpu_kernel void @buffer_atomic_add_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 @@ -48,14 +48,14 @@ define amdgpu_kernel void @buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) { ; GFX90A-LABEL: buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 4 offen glc slc +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 4 offen glc slc scc /* unexpected cache policy bit */ ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -69,9 +69,9 @@ define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 @@ -98,9 +98,9 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) { ; GFX90A-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] @@ -119,9 +119,9 @@ define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 @@ -148,9 +148,9 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) { ; GFX90A-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] @@ -169,9 +169,9 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 @@ -198,9 +198,9 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) { ; GFX90A-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] @@ -219,9 +219,9 @@ define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 @@ -248,9 +248,9 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) { ; GFX90A-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] @@ -269,9 +269,9 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 @@ -298,9 +298,9 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) { ; GFX90A-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] @@ -319,9 +319,9 @@ define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 @@ -348,9 +348,9 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) { ; GFX90A-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll @@ -2791,10 +2791,10 @@ ; ; GFX8-LABEL: insertelement_v_v16i16_s_s: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 16, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GFX8-NEXT: flat_load_dwordx4 v[2:5], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[0:1] ; GFX8-NEXT: s_and_b32 s1, s3, 1 ; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s12, s3, 1 @@ -2810,30 +2810,30 @@ ; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], s12, 5 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], s12, 6 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], s12, 7 -; GFX8-NEXT: v_mov_b32_e32 v10, 16 -; GFX8-NEXT: v_mov_b32_e32 v11, 0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_cndmask_b32_e32 v8, v0, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v2, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v3, s[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v4, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v5, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v6, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v7, s[10:11] -; GFX8-NEXT: v_and_b32_e32 v8, s14, v8 -; GFX8-NEXT: v_or_b32_e32 v8, s13, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v7, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[10:11] +; GFX8-NEXT: v_and_b32_e32 v0, s14, v0 +; GFX8-NEXT: v_or_b32_e32 v10, s13, v0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[2:3] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[10:11] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, v10, s[12:13] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v4, v10, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v10, s[2:3] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v10, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v7, v10, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v8, v10, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v9, v10, s[10:11] ; GFX8-NEXT: v_mov_b32_e32 v8, 0 ; GFX8-NEXT: v_mov_b32_e32 v9, 0 +; GFX8-NEXT: v_mov_b32_e32 v10, 16 +; GFX8-NEXT: v_mov_b32_e32 v11, 0 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; GFX8-NEXT: s_endpgm @@ -3707,10 +3707,10 @@ ; ; GFX8-LABEL: insertelement_v_v16i16_s_v: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 16, v0 -; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] -; GFX8-NEXT: flat_load_dwordx4 v[7:10], v[7:8] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx4 v[7:10], v[0:1] ; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v2 ; GFX8-NEXT: v_and_b32_e32 v1, 1, v2 @@ -3902,10 +3902,10 @@ ; ; GFX8-LABEL: insertelement_v_v16i16_v_s: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 16, v0 -; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] -; GFX8-NEXT: flat_load_dwordx4 v[7:10], v[7:8] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx4 v[7:10], v[0:1] ; GFX8-NEXT: s_and_b32 s1, s2, 1 ; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s12, s2, 1 @@ -4096,10 +4096,10 @@ ; ; GFX8-LABEL: insertelement_v_v16i16_v_v: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 16, v0 -; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1] -; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[0:1] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v3 ; GFX8-NEXT: v_and_b32_e32 v1, 1, v3 ; GFX8-NEXT: s_mov_b32 s0, 0xffff diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll @@ -904,27 +904,27 @@ ; ; GFX8-LABEL: insertelement_s_v4i8_v_s: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX8-NEXT: s_movk_i32 s1, 0xff ; GFX8-NEXT: v_mov_b32_e32 v2, 8 ; GFX8-NEXT: v_mov_b32_e32 v3, 16 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008 -; GFX8-NEXT: s_lshr_b32 s2, s1, 24 -; GFX8-NEXT: s_and_b32 s3, s1, s0 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x80008 +; GFX8-NEXT: s_lshr_b32 s2, s0, 24 +; GFX8-NEXT: s_and_b32 s3, s0, s1 ; GFX8-NEXT: s_lshl_b32 s5, s5, 8 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX8-NEXT: s_or_b32 s3, s3, s5 -; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_or_b32 s1, s3, s1 +; GFX8-NEXT: s_lshl_b32 s0, s0, 16 +; GFX8-NEXT: s_or_b32 s0, s3, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, 24 -; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: s_or_b32 s0, s0, s2 ; GFX8-NEXT: s_and_b32 s2, s4, 3 ; GFX8-NEXT: s_lshl_b32 s2, s2, 3 ; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: s_lshl_b32 s0, s0, s2 +; GFX8-NEXT: s_lshl_b32 s1, s1, s2 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: s_andn2_b32 s0, s1, s0 +; GFX8-NEXT: s_andn2_b32 s0, s0, s1 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v0 @@ -941,28 +941,28 @@ ; GFX7-LABEL: insertelement_s_v4i8_v_s: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX7-NEXT: s_movk_i32 s5, 0xff -; GFX7-NEXT: v_and_b32_e32 v0, s5, v0 +; GFX7-NEXT: s_movk_i32 s2, 0xff +; GFX7-NEXT: v_and_b32_e32 v0, s2, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s3, s0, 0x80008 +; GFX7-NEXT: s_bfe_u32 s5, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s1, s0, 24 -; GFX7-NEXT: s_and_b32 s2, s0, s5 -; GFX7-NEXT: s_lshl_b32 s3, s3, 8 +; GFX7-NEXT: s_and_b32 s3, s0, s2 +; GFX7-NEXT: s_lshl_b32 s5, s5, 8 ; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX7-NEXT: s_or_b32 s2, s2, s3 +; GFX7-NEXT: s_or_b32 s3, s3, s5 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 -; GFX7-NEXT: s_or_b32 s0, s2, s0 +; GFX7-NEXT: s_or_b32 s0, s3, s0 ; GFX7-NEXT: s_lshl_b32 s1, s1, 24 ; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: s_and_b32 s1, s4, 3 ; GFX7-NEXT: s_lshl_b32 s1, s1, 3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s1, v0 -; GFX7-NEXT: s_lshl_b32 s1, s5, s1 +; GFX7-NEXT: s_lshl_b32 s1, s2, s1 ; GFX7-NEXT: s_andn2_b32 s0, s0, s1 ; GFX7-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_bfe_u32 v3, v0, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s5, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s2, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 @@ -1053,27 +1053,27 @@ ; ; GFX8-LABEL: insertelement_s_v4i8_s_v: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX8-NEXT: s_movk_i32 s1, 0xff ; GFX8-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008 -; GFX8-NEXT: s_lshr_b32 s2, s1, 24 -; GFX8-NEXT: s_and_b32 s3, s1, s0 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x80008 +; GFX8-NEXT: s_lshr_b32 s2, s0, 24 +; GFX8-NEXT: s_and_b32 s3, s0, s1 ; GFX8-NEXT: s_lshl_b32 s5, s5, 8 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX8-NEXT: s_or_b32 s3, s3, s5 -; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_or_b32 s1, s3, s1 +; GFX8-NEXT: s_lshl_b32 s0, s0, 16 +; GFX8-NEXT: s_or_b32 s0, s3, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, 24 -; GFX8-NEXT: s_or_b32 s1, s1, s2 -; GFX8-NEXT: s_and_b32 s2, s4, s0 +; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s2, s4, s1 ; GFX8-NEXT: v_lshlrev_b32_e64 v1, v0, s2 -; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s0 +; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s1 ; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, s1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_mov_b32_e32 v3, 16 @@ -1091,29 +1091,29 @@ ; GFX7-LABEL: insertelement_s_v4i8_s_v: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX7-NEXT: s_movk_i32 s5, 0xff +; GFX7-NEXT: s_movk_i32 s2, 0xff ; GFX7-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s3, s0, 0x80008 +; GFX7-NEXT: s_bfe_u32 s5, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s1, s0, 24 -; GFX7-NEXT: s_and_b32 s2, s0, s5 -; GFX7-NEXT: s_lshl_b32 s3, s3, 8 +; GFX7-NEXT: s_and_b32 s3, s0, s2 +; GFX7-NEXT: s_lshl_b32 s5, s5, 8 ; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX7-NEXT: s_or_b32 s2, s2, s3 +; GFX7-NEXT: s_or_b32 s3, s3, s5 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 -; GFX7-NEXT: s_or_b32 s0, s2, s0 +; GFX7-NEXT: s_or_b32 s0, s3, s0 ; GFX7-NEXT: s_lshl_b32 s1, s1, 24 ; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_and_b32 s1, s4, s5 +; GFX7-NEXT: s_and_b32 s1, s4, s2 ; GFX7-NEXT: v_lshl_b32_e32 v1, s1, v0 -; GFX7-NEXT: v_lshl_b32_e32 v0, s5, v0 +; GFX7-NEXT: v_lshl_b32_e32 v0, s2, v0 ; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_bfe_u32 v3, v0, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s5, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s2, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 @@ -1204,25 +1204,25 @@ ; ; GFX8-LABEL: insertelement_s_v4i8_v_v: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX8-NEXT: s_movk_i32 s1, 0xff ; GFX8-NEXT: v_and_b32_e32 v1, 3, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008 -; GFX8-NEXT: s_lshr_b32 s2, s1, 24 -; GFX8-NEXT: s_and_b32 s3, s1, s0 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX8-NEXT: s_lshr_b32 s2, s0, 24 +; GFX8-NEXT: s_and_b32 s3, s0, s1 ; GFX8-NEXT: s_lshl_b32 s4, s4, 8 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX8-NEXT: s_or_b32 s3, s3, s4 -; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_or_b32 s1, s3, s1 +; GFX8-NEXT: s_lshl_b32 s0, s0, 16 +; GFX8-NEXT: s_or_b32 s0, s3, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, 24 -; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 -; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s1 +; GFX8-NEXT: s_or_b32 s0, s0, s2 ; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX8-NEXT: v_and_b32_e32 v1, s1, v1 +; GFX8-NEXT: v_and_b32_e32 v1, s0, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 8 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 @@ -1241,29 +1241,29 @@ ; GFX7-LABEL: insertelement_s_v4i8_v_v: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_movk_i32 s2, 0xff ; GFX7-NEXT: v_and_b32_e32 v1, 3, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v0, s2, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s3, s0, 0x80008 +; GFX7-NEXT: s_bfe_u32 s4, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s1, s0, 24 -; GFX7-NEXT: s_and_b32 s2, s0, s4 -; GFX7-NEXT: s_lshl_b32 s3, s3, 8 +; GFX7-NEXT: s_and_b32 s3, s0, s2 +; GFX7-NEXT: s_lshl_b32 s4, s4, 8 ; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX7-NEXT: s_or_b32 s2, s2, s3 +; GFX7-NEXT: s_or_b32 s3, s3, s4 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 -; GFX7-NEXT: s_or_b32 s0, s2, s0 +; GFX7-NEXT: s_or_b32 s0, s3, s0 ; GFX7-NEXT: s_lshl_b32 s1, s1, 24 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_lshl_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshl_b32_e32 v1, s2, v1 ; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX7-NEXT: v_and_b32_e32 v1, s0, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_bfe_u32 v3, v0, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s2, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll @@ -39,8 +39,8 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX8-NEXT: s_trap 2 -; GFX8-NEXT: flat_store_dword v[0:1], v0 -; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dword v[0:1], v0 +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: func_use_lds_global_constexpr_cast: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll @@ -11,16 +11,16 @@ ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v0, 4 ; CHECK-NEXT: s_mov_b32 m0, -1 -; CHECK-NEXT: ds_read_b32 v3, v0 -; CHECK-NEXT: v_mov_b32_e32 v2, 9 +; CHECK-NEXT: ds_read_b32 v2, v0 +; CHECK-NEXT: v_mov_b32_e32 v3, 9 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_add_u32 s0, s0, 4 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_mov_b32_e32 v0, s0 ; CHECK-NEXT: v_mov_b32_e32 v1, s1 -; CHECK-NEXT: flat_store_dword v[0:1], v3 +; CHECK-NEXT: flat_store_dword v[0:1], v2 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x200 -; CHECK-NEXT: ds_write_b32 v0, v2 +; CHECK-NEXT: ds_write_b32 v0, v3 ; CHECK-NEXT: s_endpgm entry: %tmp0 = getelementptr [128 x i32], [128 x i32] addrspace(3)* @lds_512_4, i32 0, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll @@ -19,8 +19,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 { ; CI-LABEL: lds_atomic_dec_ret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -34,8 +34,8 @@ ; ; VI-LABEL: lds_atomic_dec_ret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -67,8 +67,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 { ; CI-LABEL: lds_atomic_dec_ret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -82,8 +82,8 @@ ; ; VI-LABEL: lds_atomic_dec_ret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -366,42 +366,42 @@ ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: v_mov_b32_e32 v3, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: v_add_i32_e32 v3, vcc, v0, v2 -; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v3 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; CI-NEXT: v_mov_b32_e32 v4, 42 -; CI-NEXT: flat_atomic_dec v2, v[2:3], v4 glc ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: flat_store_dword v[0:1], v3 ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_dec_ret_i32_offset_addr64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: v_mov_b32_e32 v3, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v3, vcc, v0, v2 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v3 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; VI-NEXT: v_mov_b32_e32 v4, 42 -; VI-NEXT: flat_atomic_dec v2, v[2:3], v4 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: s_endpgm ; GFX9-LABEL: global_atomic_dec_ret_i32_offset_addr64: ; GFX9: ; %bb.0: @@ -669,42 +669,42 @@ ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: v_mov_b32_e32 v3, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: v_add_i32_e32 v3, vcc, v0, v2 -; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v3 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; CI-NEXT: v_mov_b32_e32 v4, 42 -; CI-NEXT: flat_atomic_dec v2, v[2:3], v4 glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: flat_store_dword v[0:1], v3 ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_dec_ret_i32_offset_addr64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: v_mov_b32_e32 v3, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v3, vcc, v0, v2 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v3 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; VI-NEXT: v_mov_b32_e32 v4, 42 -; VI-NEXT: flat_atomic_dec v2, v[2:3], v4 glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: s_endpgm ; GFX9-LABEL: flat_atomic_dec_ret_i32_offset_addr64: ; GFX9: ; %bb.0: @@ -983,45 +983,45 @@ ; CI-LABEL: flat_atomic_dec_ret_i64_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: v_add_i32_e32 v4, vcc, v0, v2 -; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: v_mov_b32_e32 v2, 42 -; CI-NEXT: v_add_i32_e32 v4, vcc, 40, v4 -; CI-NEXT: v_mov_b32_e32 v3, 0 -; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[4:5], v[2:3] glc +; CI-NEXT: v_add_i32_e32 v2, vcc, v0, v4 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_dec_ret_i64_offset_addr64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v4, vcc, v0, v2 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v2, 42 -; VI-NEXT: v_add_u32_e32 v4, vcc, 40, v4 -; VI-NEXT: v_mov_b32_e32 v3, 0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[4:5], v[2:3] glc +; VI-NEXT: v_add_u32_e32 v2, vcc, v0, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; GFX9-LABEL: flat_atomic_dec_ret_i64_offset_addr64: ; GFX9: ; %bb.0: @@ -1122,35 +1122,35 @@ ; CI-LABEL: atomic_dec_shl_base_lds_0: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_add_i32_e32 v2, vcc, 2, v0 -; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; CI-NEXT: v_mov_b32_e32 v1, 9 +; CI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; CI-NEXT: v_mov_b32_e32 v2, 9 ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: ds_dec_rtn_u32 v3, v0, v1 offset:8 +; CI-NEXT: ds_dec_rtn_u32 v2, v1, v2 offset:8 +; CI-NEXT: v_add_i32_e32 v3, vcc, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: flat_store_dword v[0:1], v3 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: flat_store_dword v[0:1], v3 +; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_dec_shl_base_lds_0: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 2, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: v_mov_b32_e32 v1, 9 +; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; VI-NEXT: v_mov_b32_e32 v2, 9 ; VI-NEXT: s_mov_b32 m0, -1 -; VI-NEXT: ds_dec_rtn_u32 v3, v0, v1 offset:8 +; VI-NEXT: ds_dec_rtn_u32 v2, v1, v2 offset:8 +; VI-NEXT: v_add_u32_e32 v3, vcc, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_store_dword v[0:1], v3 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; GFX9-LABEL: atomic_dec_shl_base_lds_0: ; GFX9: ; %bb.0: @@ -1181,8 +1181,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 { ; CI-LABEL: lds_atomic_dec_ret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 m0, -1 @@ -1197,8 +1197,8 @@ ; ; VI-LABEL: lds_atomic_dec_ret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 m0, -1 @@ -1232,8 +1232,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 { ; CI-LABEL: lds_atomic_dec_ret_i64_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 m0, -1 @@ -1248,8 +1248,8 @@ ; ; VI-LABEL: lds_atomic_dec_ret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 m0, -1 @@ -1542,45 +1542,45 @@ ; CI-LABEL: global_atomic_dec_ret_i64_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: v_add_i32_e32 v4, vcc, v0, v2 -; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: v_mov_b32_e32 v2, 42 -; CI-NEXT: v_add_i32_e32 v4, vcc, 40, v4 -; CI-NEXT: v_mov_b32_e32 v3, 0 -; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[4:5], v[2:3] glc +; CI-NEXT: v_add_i32_e32 v2, vcc, v0, v4 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_dec_ret_i64_offset_addr64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v4, vcc, v0, v2 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v2, 42 -; VI-NEXT: v_add_u32_e32 v4, vcc, 40, v4 -; VI-NEXT: v_mov_b32_e32 v3, 0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[4:5], v[2:3] glc +; VI-NEXT: v_add_u32_e32 v2, vcc, v0, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; GFX9-LABEL: global_atomic_dec_ret_i64_offset_addr64: ; GFX9: ; %bb.0: @@ -1681,37 +1681,37 @@ ; CI-LABEL: atomic_dec_shl_base_lds_0_i64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_add_i32_e32 v4, vcc, 2, v0 -; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; CI-NEXT: v_mov_b32_e32 v0, 9 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_mov_b32_e32 v1, 9 +; CI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; CI-NEXT: v_mov_b32_e32 v2, 0 ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:16 +; CI-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v3, s3 -; CI-NEXT: flat_store_dword v[2:3], v4 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: v_mov_b32_e32 v4, s3 +; CI-NEXT: v_add_i32_e32 v0, vcc, 2, v0 +; CI-NEXT: v_mov_b32_e32 v3, s2 +; CI-NEXT: flat_store_dword v[3:4], v0 +; CI-NEXT: v_mov_b32_e32 v4, s1 +; CI-NEXT: v_mov_b32_e32 v3, s0 +; CI-NEXT: flat_store_dwordx2 v[3:4], v[1:2] ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_dec_shl_base_lds_0_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0 -; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: v_mov_b32_e32 v0, 9 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_mov_b32_e32 v1, 9 +; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_mov_b32 m0, -1 -; VI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:16 +; VI-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: flat_store_dword v[2:3], v4 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v4, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0 +; VI-NEXT: v_mov_b32_e32 v3, s2 +; VI-NEXT: flat_store_dword v[3:4], v0 +; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: flat_store_dwordx2 v[3:4], v[1:2] ; VI-NEXT: s_endpgm ; GFX9-LABEL: atomic_dec_shl_base_lds_0_i64: ; GFX9: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll @@ -21,8 +21,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 { ; CI-LABEL: lds_atomic_inc_ret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -36,8 +36,8 @@ ; ; VI-LABEL: lds_atomic_inc_ret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -51,8 +51,8 @@ ; ; GFX9-LABEL: lds_atomic_inc_ret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -86,8 +86,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 { ; CI-LABEL: lds_atomic_inc_ret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -101,8 +101,8 @@ ; ; VI-LABEL: lds_atomic_inc_ret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -116,8 +116,8 @@ ; ; GFX9-LABEL: lds_atomic_inc_ret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -439,42 +439,42 @@ ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: v_mov_b32_e32 v3, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: v_add_i32_e32 v3, vcc, v0, v2 -; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v3 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; CI-NEXT: v_mov_b32_e32 v4, 42 -; CI-NEXT: flat_atomic_inc v2, v[2:3], v4 glc ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: flat_store_dword v[0:1], v3 ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_inc_ret_i32_offset_addr64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: v_mov_b32_e32 v3, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v3, vcc, v0, v2 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v3 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; VI-NEXT: v_mov_b32_e32 v4, 42 -; VI-NEXT: flat_atomic_inc v2, v[2:3], v4 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_inc_ret_i32_offset_addr64: @@ -568,48 +568,48 @@ ; CI-LABEL: atomic_inc_shl_base_lds_0_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_add_i32_e32 v2, vcc, 2, v0 -; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; CI-NEXT: v_mov_b32_e32 v1, 9 +; CI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; CI-NEXT: v_mov_b32_e32 v2, 9 ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: ds_inc_rtn_u32 v3, v0, v1 offset:8 +; CI-NEXT: ds_inc_rtn_u32 v2, v1, v2 offset:8 +; CI-NEXT: v_add_i32_e32 v3, vcc, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: flat_store_dword v[0:1], v3 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: flat_store_dword v[0:1], v3 +; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_inc_shl_base_lds_0_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 2, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: v_mov_b32_e32 v1, 9 +; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; VI-NEXT: v_mov_b32_e32 v2, 9 ; VI-NEXT: s_mov_b32 m0, -1 -; VI-NEXT: ds_inc_rtn_u32 v3, v0, v1 offset:8 +; VI-NEXT: ds_inc_rtn_u32 v2, v1, v2 offset:8 +; VI-NEXT: v_add_u32_e32 v3, vcc, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_store_dword v[0:1], v3 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_inc_shl_base_lds_0_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_add_u32_e32 v1, 2, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 9 -; GFX9-NEXT: ds_inc_rtn_u32 v0, v0, v2 offset:8 +; GFX9-NEXT: ds_inc_rtn_u32 v1, v1, v2 offset:8 +; GFX9-NEXT: v_add_u32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v2, v1, s[2:3] -; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[2:3] +; GFX9-NEXT: global_store_dword v2, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: atomic_inc_shl_base_lds_0_i32: @@ -636,8 +636,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 { ; CI-LABEL: lds_atomic_inc_ret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 m0, -1 @@ -652,8 +652,8 @@ ; ; VI-LABEL: lds_atomic_inc_ret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 m0, -1 @@ -668,8 +668,8 @@ ; ; GFX9-LABEL: lds_atomic_inc_ret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -701,8 +701,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 { ; CI-LABEL: lds_atomic_inc_ret_i64_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 m0, -1 @@ -717,8 +717,8 @@ ; ; VI-LABEL: lds_atomic_inc_ret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 m0, -1 @@ -733,8 +733,8 @@ ; ; GFX9-LABEL: lds_atomic_inc_ret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1081,45 +1081,45 @@ ; CI-LABEL: global_atomic_inc_ret_i64_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: v_add_i32_e32 v4, vcc, v0, v2 -; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: v_mov_b32_e32 v2, 42 -; CI-NEXT: v_add_i32_e32 v4, vcc, 40, v4 -; CI-NEXT: v_mov_b32_e32 v3, 0 -; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc +; CI-NEXT: v_add_i32_e32 v2, vcc, v0, v4 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_inc_ret_i64_offset_addr64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v4, vcc, v0, v2 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v2, 42 -; VI-NEXT: v_add_u32_e32 v4, vcc, 40, v4 -; VI-NEXT: v_mov_b32_e32 v3, 0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc +; VI-NEXT: v_add_u32_e32 v2, vcc, v0, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_inc_ret_i64_offset_addr64: @@ -1370,61 +1370,61 @@ ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: v_mov_b32_e32 v3, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: v_add_i32_e32 v3, vcc, v0, v2 -; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v3 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; CI-NEXT: v_mov_b32_e32 v4, 42 -; CI-NEXT: flat_atomic_inc v2, v[2:3], v4 glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: flat_store_dword v[0:1], v3 ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_inc_ret_i32_offset_addr64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: v_mov_b32_e32 v3, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v3, vcc, v0, v2 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v3 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; VI-NEXT: v_mov_b32_e32 v4, 42 -; VI-NEXT: flat_atomic_inc v2, v[2:3], v4 glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_inc_ret_i32_offset_addr64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_inc v3, v[0:1], v3 offset:20 glc +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 -; GFX9-NEXT: v_mov_b32_e32 v4, 42 -; GFX9-NEXT: flat_atomic_inc v0, v[0:1], v4 offset:20 glc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_store_dword v[2:3], v0 +; GFX9-NEXT: flat_store_dword v[0:1], v3 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_inc_ret_i32_offset_addr64: @@ -1527,51 +1527,51 @@ ; CI-LABEL: atomic_inc_shl_base_lds_0_i64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_add_i32_e32 v4, vcc, 2, v0 -; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; CI-NEXT: v_mov_b32_e32 v0, 9 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_mov_b32_e32 v1, 9 +; CI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; CI-NEXT: v_mov_b32_e32 v2, 0 ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:16 +; CI-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v3, s3 -; CI-NEXT: flat_store_dword v[2:3], v4 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: v_mov_b32_e32 v4, s3 +; CI-NEXT: v_add_i32_e32 v0, vcc, 2, v0 +; CI-NEXT: v_mov_b32_e32 v3, s2 +; CI-NEXT: flat_store_dword v[3:4], v0 +; CI-NEXT: v_mov_b32_e32 v4, s1 +; CI-NEXT: v_mov_b32_e32 v3, s0 +; CI-NEXT: flat_store_dwordx2 v[3:4], v[1:2] ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_inc_shl_base_lds_0_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0 -; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: v_mov_b32_e32 v0, 9 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_mov_b32_e32 v1, 9 +; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_mov_b32 m0, -1 -; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:16 +; VI-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: flat_store_dword v[2:3], v4 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v4, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0 +; VI-NEXT: v_mov_b32_e32 v3, s2 +; VI-NEXT: flat_store_dword v[3:4], v0 +; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: flat_store_dwordx2 v[3:4], v[1:2] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_inc_shl_base_lds_0_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v1, 9 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_add_u32_e32 v3, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-NEXT: ds_inc_rtn_u64 v[0:1], v0, v[1:2] offset:16 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX9-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16 +; GFX9-NEXT: v_add_u32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v2, v3, s[2:3] -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dword v3, v0, s[2:3] +; GFX9-NEXT: global_store_dwordx2 v3, v[1:2], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: atomic_inc_shl_base_lds_0_i64: @@ -1762,62 +1762,62 @@ ; CI-LABEL: flat_atomic_inc_ret_i64_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: v_add_i32_e32 v4, vcc, v0, v2 -; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: v_mov_b32_e32 v2, 42 -; CI-NEXT: v_add_i32_e32 v4, vcc, 40, v4 -; CI-NEXT: v_mov_b32_e32 v3, 0 -; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc +; CI-NEXT: v_add_i32_e32 v2, vcc, v0, v4 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_inc_ret_i64_offset_addr64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v4, vcc, v0, v2 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v2, 42 -; VI-NEXT: v_add_u32_e32 v4, vcc, 40, v4 -; VI-NEXT: v_mov_b32_e32 v3, 0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc +; VI-NEXT: v_add_u32_e32 v2, vcc, v0, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_inc_ret_i64_offset_addr64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 3, v0 -; GFX9-NEXT: v_mov_b32_e32 v4, 42 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 3, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[4:5] offset:40 glc +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[3:4], v[1:2] offset:40 glc ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1925,17 +1925,17 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(3)* %ptr) #0 { ; CI-LABEL: nocse_lds_atomic_inc_ret_i32: ; CI: ; %bb.0: +; CI-NEXT: s_load_dword s6, s[4:5], 0x4 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v1, s4 +; CI-NEXT: v_mov_b32_e32 v1, s6 ; CI-NEXT: ds_inc_rtn_u32 v4, v1, v0 ; CI-NEXT: ds_inc_rtn_u32 v5, v1, v0 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: s_waitcnt lgkmcnt(1) ; CI-NEXT: flat_store_dword v[0:1], v4 @@ -1945,17 +1945,17 @@ ; ; VI-LABEL: nocse_lds_atomic_inc_ret_i32: ; VI: ; %bb.0: +; VI-NEXT: s_load_dword s6, s[4:5], 0x10 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: ds_inc_rtn_u32 v4, v1, v0 ; VI-NEXT: ds_inc_rtn_u32 v5, v1, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: s_waitcnt lgkmcnt(1) ; VI-NEXT: flat_store_dword v[0:1], v4 @@ -1965,8 +1965,8 @@ ; ; GFX9-LABEL: nocse_lds_atomic_inc_ret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s6 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll @@ -211,22 +211,22 @@ define amdgpu_kernel void @test_div_fmas_f32(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) { ; GFX7-LABEL: test_div_fmas_f32: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX7-NEXT: s_load_dword s2, s[0:1], 0x13 ; GFX7-NEXT: s_load_dword s3, s[0:1], 0x1c -; GFX7-NEXT: s_load_dword s6, s[0:1], 0x25 -; GFX7-NEXT: s_load_dword s0, s[0:1], 0x2e -; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x25 +; GFX7-NEXT: s_load_dword s5, s[0:1], 0x2e +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: s_and_b32 s0, 1, s0 -; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: s_nop 2 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_and_b32 s2, 1, s5 +; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_nop 1 ; GFX7-NEXT: v_div_fmas_f32 v0, v0, v1, v2 -; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_fmas_f32: @@ -292,20 +292,20 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) { ; GFX7-LABEL: test_div_fmas_f32_inline_imm_0: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX7-NEXT: s_load_dword s2, s[0:1], 0x1c ; GFX7-NEXT: s_load_dword s3, s[0:1], 0x25 -; GFX7-NEXT: s_load_dword s0, s[0:1], 0x2e -; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x2e +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: s_and_b32 s0, 1, s0 -; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX7-NEXT: s_nop 3 +; GFX7-NEXT: s_and_b32 s2, 1, s4 +; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_nop 1 ; GFX7-NEXT: v_div_fmas_f32 v0, 1.0, v0, v1 -; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_fmas_f32_inline_imm_0: @@ -365,20 +365,20 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a, float %b, float %c, [8 x i32], i1 %d) { ; GFX7-LABEL: test_div_fmas_f32_inline_imm_1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX7-NEXT: s_load_dword s2, s[0:1], 0xb ; GFX7-NEXT: s_load_dword s3, s[0:1], 0xd -; GFX7-NEXT: s_load_dword s0, s[0:1], 0x16 -; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x16 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: s_and_b32 s0, 1, s0 -; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX7-NEXT: s_nop 3 +; GFX7-NEXT: s_and_b32 s2, 1, s4 +; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_nop 1 ; GFX7-NEXT: v_div_fmas_f32 v0, v0, 1.0, v1 -; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_fmas_f32_inline_imm_1: @@ -438,20 +438,20 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) { ; GFX7-LABEL: test_div_fmas_f32_inline_imm_2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX7-NEXT: s_load_dword s2, s[0:1], 0x13 ; GFX7-NEXT: s_load_dword s3, s[0:1], 0x1c -; GFX7-NEXT: s_load_dword s0, s[0:1], 0x2e -; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x2e +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: s_and_b32 s0, 1, s0 -; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX7-NEXT: s_nop 3 +; GFX7-NEXT: s_and_b32 s2, 1, s4 +; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_nop 1 ; GFX7-NEXT: v_div_fmas_f32 v0, v0, v1, 1.0 -; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_fmas_f32_inline_imm_2: @@ -511,41 +511,41 @@ define amdgpu_kernel void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) { ; GFX7-LABEL: test_div_fmas_f64: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s8, s[0:1], 0x11 -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x11 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-NEXT: s_and_b32 s2, 1, s8 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s7 -; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: v_mov_b32_e32 v4, s10 +; GFX7-NEXT: s_and_b32 s0, 1, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s9 +; GFX7-NEXT: v_mov_b32_e32 v5, s11 +; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX7-NEXT: s_nop 3 ; GFX7-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_fmas_f64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s8, s[0:1], 0x44 -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x44 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NEXT: s_and_b32 s2, 1, s8 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s8 +; GFX8-NEXT: v_mov_b32_e32 v4, s10 +; GFX8-NEXT: s_and_b32 s0, 1, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s9 +; GFX8-NEXT: v_mov_b32_e32 v5, s11 +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX8-NEXT: s_nop 3 ; GFX8-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -590,21 +590,21 @@ define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c, i32 %i) { ; GFX7-LABEL: test_div_fmas_f32_cond_to_vcc: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb -; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_cmp_eq_u32 s3, 0 -; GFX7-NEXT: s_cselect_b32 s3, 1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: s_and_b32 s0, 1, s3 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX7-NEXT: s_nop 3 +; GFX7-NEXT: s_cmp_eq_u32 s7, 0 +; GFX7-NEXT: s_cselect_b32 s2, 1, 0 +; GFX7-NEXT: s_and_b32 s2, 1, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_nop 2 ; GFX7-NEXT: v_div_fmas_f32 v0, v0, v1, v2 -; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_fmas_f32_cond_to_vcc: @@ -668,19 +668,19 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c) { ; GFX7-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX7-NEXT: s_load_dword s2, s[0:1], 0x13 ; GFX7-NEXT: s_load_dword s3, s[0:1], 0x1c -; GFX7-NEXT: s_load_dword s0, s[0:1], 0x25 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x25 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b64 vcc, 0 -; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_div_fmas_f32 v0, v0, v1, v2 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc: @@ -739,19 +739,19 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c) { ; GFX7-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX7-NEXT: s_load_dword s2, s[0:1], 0x13 ; GFX7-NEXT: s_load_dword s3, s[0:1], 0x1c -; GFX7-NEXT: s_load_dword s0, s[0:1], 0x25 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x25 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b64 vcc, -1 -; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_div_fmas_f32 v0, v0, v1, v2 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc: @@ -938,14 +938,14 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out, [8 x i32], float addrspace(1)* %in, [8 x i32], i32 addrspace(1)* %dummy) { ; GFX7-LABEL: test_div_fmas_f32_i1_phi_vcc: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x13 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_load_dwordx3 v[1:3], v[1:2], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dwordx3 v[1:3], v[1:2], s[4:7], 0 addr64 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7-NEXT: s_and_saveexec_b64 s[2:3], vcc @@ -961,26 +961,26 @@ ; GFX7-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX7-NEXT: s_and_b32 s0, 1, s6 ; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX7-NEXT: s_mov_b32 s10, -1 -; GFX7-NEXT: s_mov_b64 s[6:7], s[10:11] +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: s_nop 0 +; GFX7-NEXT: s_nop 1 ; GFX7-NEXT: v_div_fmas_f32 v0, v1, v2, v3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_fmas_f32_i1_phi_vcc: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GFX8-NEXT: s_mov_b32 s6, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 ; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; GFX8-NEXT: flat_load_dwordx3 v[1:3], v[1:2] +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_cbranch_execz .LBB13_2 @@ -993,6 +993,7 @@ ; GFX8-NEXT: s_cselect_b32 s6, 1, 0 ; GFX8-NEXT: .LBB13_2: ; %exit ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s2, 8 ; GFX8-NEXT: s_addc_u32 s1, s3, 0 ; GFX8-NEXT: s_and_b32 s2, 1, s6 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll @@ -765,25 +765,25 @@ define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b) { ; GFX7-LABEL: test_div_scale_f32_all_scalar_1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX7-NEXT: s_load_dword s2, s[0:1], 0x13 -; GFX7-NEXT: s_load_dword s0, s[0:1], 0x1c -; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_load_dword s3, s[0:1], 0x1c +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x13 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_div_scale_f32 v0, s[0:1], v0, v0, s2 -; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: v_div_scale_f32 v0, s[4:5], v0, v0, s4 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_scale_f32_all_scalar_1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x4c -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x70 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x70 +; GFX8-NEXT: s_load_dword s3, s[0:1], 0x4c ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v0, s3 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -809,25 +809,25 @@ define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b) { ; GFX7-LABEL: test_div_scale_f32_all_scalar_2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX7-NEXT: s_load_dword s2, s[0:1], 0x13 -; GFX7-NEXT: s_load_dword s0, s[0:1], 0x1c -; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_load_dword s3, s[0:1], 0x1c +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x13 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_div_scale_f32 v0, s[0:1], s2, v0, s2 -; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: v_div_scale_f32 v0, s[4:5], s4, v0, s4 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_scale_f32_all_scalar_2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x4c -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x70 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x70 +; GFX8-NEXT: s_load_dword s3, s[0:1], 0x4c ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], s2, v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], s3, v0, s3 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -853,13 +853,13 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, [8 x i32], double %a, [8 x i32], double %b) { ; GFX7-LABEL: test_div_scale_f64_all_scalar_1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x1d +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x1d +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -867,13 +867,13 @@ ; ; GFX8-LABEL: test_div_scale_f64_all_scalar_1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x74 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -899,13 +899,13 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, [8 x i32], double %a, [8 x i32], double %b) { ; GFX7-LABEL: test_div_scale_f64_all_scalar_2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x1d +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x1d +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], s[2:3], v[0:1], s[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], s[4:5], v[0:1], s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -913,13 +913,13 @@ ; ; GFX8-LABEL: test_div_scale_f64_all_scalar_2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x74 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], s[2:3], v[0:1], s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], s[4:5], v[0:1], s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1084,14 +1084,14 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[0:1] glc +; GFX8-NEXT: flat_load_dword v2, v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: flat_load_dword v1, v[2:3] glc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0x7fffffff, v2 +; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll @@ -61,22 +61,22 @@ define amdgpu_kernel void @global_atomic_fadd_f32_off_ss(float addrspace(1)* %ptr, float %data) { ; GFX908-LABEL: global_atomic_fadd_f32_off_ss: ; GFX908: ; %bb.0: -; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX908-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v0, s0 ; GFX908-NEXT: v_mov_b32_e32 v2, s2 +; GFX908-NEXT: v_mov_b32_e32 v0, s0 ; GFX908-NEXT: v_mov_b32_e32 v1, s1 ; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2048 ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: global_atomic_fadd_f32_off_ss: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2048 glc ; GFX90A-NEXT: s_endpgm %gep = getelementptr float, float addrspace(1)* %ptr, i64 512 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll @@ -14,13 +14,12 @@ ; GCN-LABEL: test_mfma_f32_32x32x4bf16_1k: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 -; GCN-NEXT: s_mov_b64 s[16:17], 1 -; GCN-NEXT: s_mov_b32 s18, 2 -; GCN-NEXT: s_mov_b32 s19, s17 -; GCN-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GCN-NEXT: s_mov_b64 s[36:37], 1 +; GCN-NEXT: v_pk_mov_b32 v[0:1], s[36:37], s[36:37] op_sel:[0,1] +; GCN-NEXT: s_mov_b32 s36, 2 +; GCN-NEXT: v_pk_mov_b32 v[2:3], s[36:37], s[36:37] op_sel:[0,1] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x0 -; GCN-NEXT: v_pk_mov_b32 v[2:3], s[18:19], s[18:19] op_sel:[0,1] ; GCN-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v4, s0 @@ -114,14 +113,13 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(<16 x float> addrspace(1)* %arg) { ; GCN-LABEL: test_mfma_f32_16x16x4bf16_1k: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x24 -; GCN-NEXT: s_mov_b64 s[2:3], 1 -; GCN-NEXT: s_mov_b32 s17, s3 -; GCN-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GCN-NEXT: s_mov_b32 s16, 2 +; GCN-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 +; GCN-NEXT: s_mov_b64 s[18:19], 1 +; GCN-NEXT: v_pk_mov_b32 v[0:1], s[18:19], s[18:19] op_sel:[0,1] +; GCN-NEXT: s_mov_b32 s18, 2 +; GCN-NEXT: v_pk_mov_b32 v[2:3], s[18:19], s[18:19] op_sel:[0,1] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 -; GCN-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] +; GCN-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: v_accvgpr_write_b32 a0, v4 @@ -160,10 +158,10 @@ ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[18:19] -; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[18:19] offset:16 -; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[18:19] offset:32 -; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[18:19] offset:48 +; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] +; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 +; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 +; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GCN-NEXT: s_endpgm bb: %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg @@ -177,14 +175,13 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(<4 x float> addrspace(1)* %arg) { ; GCN-LABEL: test_mfma_f32_4x4x4bf16_1k: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 -; GCN-NEXT: s_mov_b64 s[2:3], 1 -; GCN-NEXT: s_mov_b32 s5, s3 -; GCN-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GCN-NEXT: s_mov_b32 s4, 2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN-NEXT: s_mov_b64 s[6:7], 1 +; GCN-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GCN-NEXT: s_mov_b32 s6, 2 +; GCN-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: v_accvgpr_write_b32 a0, v4 @@ -198,7 +195,7 @@ ; GCN-NEXT: v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5] ; GCN-NEXT: s_endpgm bb: %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg @@ -212,14 +209,13 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(<16 x float> addrspace(1)* %arg) { ; GCN-LABEL: test_mfma_f32_32x32x8bf16_1k: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x24 -; GCN-NEXT: s_mov_b64 s[2:3], 1 -; GCN-NEXT: s_mov_b32 s17, s3 -; GCN-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GCN-NEXT: s_mov_b32 s16, 2 +; GCN-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 +; GCN-NEXT: s_mov_b64 s[18:19], 1 +; GCN-NEXT: v_pk_mov_b32 v[0:1], s[18:19], s[18:19] op_sel:[0,1] +; GCN-NEXT: s_mov_b32 s18, 2 +; GCN-NEXT: v_pk_mov_b32 v[2:3], s[18:19], s[18:19] op_sel:[0,1] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 -; GCN-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] +; GCN-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: v_accvgpr_write_b32 a0, v4 @@ -259,10 +255,10 @@ ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[18:19] -; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[18:19] offset:16 -; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[18:19] offset:32 -; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[18:19] offset:48 +; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] +; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 +; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 +; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GCN-NEXT: s_endpgm bb: %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg @@ -276,14 +272,13 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(<4 x float> addrspace(1)* %arg) { ; GCN-LABEL: test_mfma_f32_16x16x16bf16_1k: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 -; GCN-NEXT: s_mov_b64 s[2:3], 1 -; GCN-NEXT: s_mov_b32 s5, s3 -; GCN-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GCN-NEXT: s_mov_b32 s4, 2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN-NEXT: s_mov_b64 s[6:7], 1 +; GCN-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GCN-NEXT: s_mov_b32 s6, 2 +; GCN-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: v_accvgpr_write_b32 a0, v4 @@ -298,7 +293,7 @@ ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5] ; GCN-NEXT: s_endpgm bb: %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg @@ -402,10 +397,9 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(<4 x double> addrspace(1)* %arg, double %a, double %b) { ; GCN-LABEL: test_mfma_f64_16x16x4f64_imm: ; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x24 ; GCN-NEXT: s_mov_b64 s[4:5], 0 ; GCN-NEXT: s_mov_b64 s[10:11], 1.0 -; GCN-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x24 -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GCN-NEXT: s_mov_b64 s[6:7], s[4:5] ; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] ; GCN-NEXT: v_mov_b32_e32 v4, s4 @@ -413,6 +407,7 @@ ; GCN-NEXT: v_mov_b32_e32 v4, s5 ; GCN-NEXT: v_accvgpr_write_b32 a1, v4 ; GCN-NEXT: v_mov_b32_e32 v4, s6 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN-NEXT: v_accvgpr_write_b32 a2, v4 ; GCN-NEXT: v_mov_b32_e32 v4, s7 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -426,7 +421,7 @@ ; GCN-NEXT: v_accvgpr_write_b32 a6, v4 ; GCN-NEXT: v_mov_b32_e32 v4, s11 ; GCN-NEXT: v_accvgpr_write_b32 a7, v4 -; GCN-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] ; GCN-NEXT: v_mov_b32_e32 v0, 0 @@ -445,11 +440,10 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(<4 x double> addrspace(1)* %arg, double %a, double %b) { ; GCN-LABEL: test_mfma_f64_16x16x4f64_splat_lit: ; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s4, 0 ; GCN-NEXT: s_mov_b32 s5, 0x405ec000 ; GCN-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x24 -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] ; GCN-NEXT: s_mov_b64 s[10:11], s[4:5] ; GCN-NEXT: v_mov_b32_e32 v4, s4 @@ -457,6 +451,7 @@ ; GCN-NEXT: v_mov_b32_e32 v4, s5 ; GCN-NEXT: v_accvgpr_write_b32 a1, v4 ; GCN-NEXT: v_mov_b32_e32 v4, s6 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN-NEXT: v_accvgpr_write_b32 a2, v4 ; GCN-NEXT: v_mov_b32_e32 v4, s7 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -470,7 +465,7 @@ ; GCN-NEXT: v_accvgpr_write_b32 a6, v4 ; GCN-NEXT: v_mov_b32_e32 v4, s11 ; GCN-NEXT: v_accvgpr_write_b32 a7, v4 -; GCN-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] ; GCN-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll @@ -7,14 +7,14 @@ define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in) { ; GFX8-LABEL: dpp_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX8-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll @@ -44,17 +44,17 @@ define amdgpu_kernel void @bfe_i32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { ; GFX6-LABEL: bfe_i32_arg_arg_imm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb -; GFX6-NEXT: s_load_dword s0, s[0:1], 0xc -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dword s3, s[0:1], 0xc +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s0, s0, 63 -; GFX6-NEXT: s_or_b32 s0, s0, 0x7b0000 -; GFX6-NEXT: s_bfe_i32 s0, s2, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_and_b32 s3, s3, 63 +; GFX6-NEXT: s_or_b32 s3, s3, 0x7b0000 +; GFX6-NEXT: s_bfe_i32 s3, s4, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 %src1, i32 123) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 @@ -64,17 +64,17 @@ define amdgpu_kernel void @bfe_i32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) #0 { ; GFX6-LABEL: bfe_i32_arg_imm_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb -; GFX6-NEXT: s_load_dword s0, s[0:1], 0xc -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dword s3, s[0:1], 0xc +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_or_b32 s0, 59, s0 -; GFX6-NEXT: s_bfe_i32 s0, s2, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_or_b32 s3, 59, s3 +; GFX6-NEXT: s_bfe_i32 s3, s4, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 123, i32 %src2) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 @@ -84,18 +84,18 @@ define amdgpu_kernel void @bfe_i32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) #0 { ; GFX6-LABEL: bfe_i32_imm_arg_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb -; GFX6-NEXT: s_load_dword s0, s[0:1], 0xc -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dword s3, s[0:1], 0xb +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xc +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s1, s2, 63 -; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_or_b32 s0, s1, s0 -; GFX6-NEXT: s_bfe_i32 s0, 0x7b, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_and_b32 s3, s3, 63 +; GFX6-NEXT: s_lshl_b32 s4, s4, 16 +; GFX6-NEXT: s_or_b32 s3, s3, s4 +; GFX6-NEXT: s_bfe_i32 s3, 0x7b, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 123, i32 %src1, i32 %src2) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 @@ -105,16 +105,16 @@ define amdgpu_kernel void @v_bfe_print_arg(i32 addrspace(1)* %out, i32 addrspace(1)* %src0) #0 { ; GFX6-LABEL: v_bfe_print_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_i32 s0, s0, 0x80002 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_bfe_i32 s3, s3, 0x80002 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %load = load i32, i32 addrspace(1)* %src0, align 4 %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %load, i32 2, i32 8) @@ -125,16 +125,16 @@ define amdgpu_kernel void @bfe_i32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { ; GFX6-LABEL: bfe_i32_arg_0_width_reg_offset: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb -; GFX6-NEXT: s_load_dword s0, s[0:1], 0xc -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dword s3, s[0:1], 0xc +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s0, s0, 63 -; GFX6-NEXT: s_bfe_i32 s0, s2, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_and_b32 s3, s3, 63 +; GFX6-NEXT: s_bfe_i32 s3, s4, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 %src1, i32 0) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 @@ -144,14 +144,14 @@ define amdgpu_kernel void @bfe_i32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { ; GFX6-LABEL: bfe_i32_arg_0_width_imm_offset: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dword s3, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_i32 s0, s0, 8 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_bfe_i32 s3, s3, 8 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 8, i32 0) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 @@ -161,17 +161,17 @@ define amdgpu_kernel void @bfe_i32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_i32_test_6: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s0, s0, 31 -; GFX6-NEXT: s_bfe_i32 s0, s0, 0x1f0001 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_lshl_b32 s3, s3, 31 +; GFX6-NEXT: s_bfe_i32 s3, s3, 0x1f0001 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 @@ -183,17 +183,17 @@ define amdgpu_kernel void @bfe_i32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_i32_test_7: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s0, s0, 31 -; GFX6-NEXT: s_bfe_i32 s0, s0, 0x1f0000 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_lshl_b32 s3, s3, 31 +; GFX6-NEXT: s_bfe_i32 s3, s3, 0x1f0000 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 @@ -205,17 +205,17 @@ define amdgpu_kernel void @bfe_i32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_i32_test_8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s0, s0, 31 -; GFX6-NEXT: s_bfe_i32 s0, s0, 0x1001f -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_lshl_b32 s3, s3, 31 +; GFX6-NEXT: s_bfe_i32 s3, s3, 0x1001f +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 @@ -227,16 +227,16 @@ define amdgpu_kernel void @bfe_i32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_i32_test_9: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_i32 s0, s0, 0x1001f -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_bfe_i32 s3, s3, 0x1001f +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 31, i32 1) @@ -247,16 +247,16 @@ define amdgpu_kernel void @bfe_i32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_i32_test_10: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_i32 s0, s0, 0x1f0001 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_bfe_i32 s3, s3, 0x1f0001 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 1, i32 31) @@ -267,16 +267,16 @@ define amdgpu_kernel void @bfe_i32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_i32_test_11: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_i32 s0, s0, 0x180008 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_bfe_i32 s3, s3, 0x180008 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 8, i32 24) @@ -287,16 +287,16 @@ define amdgpu_kernel void @bfe_i32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_i32_test_12: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_i32 s0, s0, 0x80018 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_bfe_i32 s3, s3, 0x80018 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 24, i32 8) @@ -307,17 +307,17 @@ define amdgpu_kernel void @bfe_i32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_i32_test_13: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_ashr_i32 s0, s0, 31 -; GFX6-NEXT: s_bfe_i32 s0, s0, 0x1001f -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_ashr_i32 s3, s3, 31 +; GFX6-NEXT: s_bfe_i32 s3, s3, 0x1001f +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %shl = ashr i32 %x, 31 @@ -328,17 +328,17 @@ define amdgpu_kernel void @bfe_i32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_i32_test_14: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshr_b32 s0, s0, 31 -; GFX6-NEXT: s_bfe_i32 s0, s0, 0x1001f -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_lshr_b32 s3, s3, 31 +; GFX6-NEXT: s_bfe_i32 s3, s3, 0x1001f +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %shl = lshr i32 %x, 31 @@ -666,17 +666,17 @@ define amdgpu_kernel void @bfe_sext_in_reg_i24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_sext_in_reg_i24: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_i32 s0, s0, 0x180000 -; GFX6-NEXT: s_bfe_i32 s0, s0, 0x180000 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_bfe_i32 s3, s3, 0x180000 +; GFX6-NEXT: s_bfe_i32 s3, s3, 0x180000 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 0, i32 24) @@ -731,16 +731,16 @@ define amdgpu_kernel void @bfe_0_width(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { ; GFX6-LABEL: bfe_0_width: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_i32 s0, s0, 8 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_bfe_i32 s3, s3, 8 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %load = load i32, i32 addrspace(1)* %ptr, align 4 %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %load, i32 8, i32 0) @@ -751,18 +751,18 @@ define amdgpu_kernel void @bfe_8_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { ; GFX6-LABEL: bfe_8_bfe_8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s4, 0x80000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX6-NEXT: s_mov_b32 s1, 0x80000 +; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_i32 s0, s0, s1 -; GFX6-NEXT: s_bfe_i32 s0, s0, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_bfe_i32 s3, s3, s4 +; GFX6-NEXT: s_bfe_i32 s3, s3, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %load = load i32, i32 addrspace(1)* %ptr, align 4 %bfe0 = call i32 @llvm.amdgcn.sbfe.i32(i32 %load, i32 0, i32 8) @@ -774,17 +774,17 @@ define amdgpu_kernel void @bfe_8_bfe_16(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { ; GFX6-LABEL: bfe_8_bfe_16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_i32 s0, s0, 0x80000 -; GFX6-NEXT: s_bfe_i32 s0, s0, 0x100000 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_bfe_i32 s3, s3, 0x80000 +; GFX6-NEXT: s_bfe_i32 s3, s3, 0x100000 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %load = load i32, i32 addrspace(1)* %ptr, align 4 %bfe0 = call i32 @llvm.amdgcn.sbfe.i32(i32 %load, i32 0, i32 8) @@ -797,17 +797,17 @@ define amdgpu_kernel void @bfe_16_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { ; GFX6-LABEL: bfe_16_bfe_8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_i32 s0, s0, 0x100000 -; GFX6-NEXT: s_bfe_i32 s0, s0, 0x80000 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_bfe_i32 s3, s3, 0x100000 +; GFX6-NEXT: s_bfe_i32 s3, s3, 0x80000 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %load = load i32, i32 addrspace(1)* %ptr, align 4 %bfe0 = call i32 @llvm.amdgcn.sbfe.i32(i32 %load, i32 0, i32 16) @@ -820,17 +820,17 @@ define amdgpu_kernel void @sext_in_reg_i8_to_i32_bfe(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { ; GFX6-LABEL: sext_in_reg_i8_to_i32_bfe: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb -; GFX6-NEXT: s_load_dword s0, s[0:1], 0xc -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dword s3, s[0:1], 0xb +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xc +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_add_i32 s2, s2, s0 -; GFX6-NEXT: s_bfe_i32 s0, s2, 0x80000 -; GFX6-NEXT: s_sext_i32_i8 s0, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_add_i32 s3, s3, s4 +; GFX6-NEXT: s_bfe_i32 s3, s3, 0x80000 +; GFX6-NEXT: s_sext_i32_i8 s3, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %c = add i32 %a, %b ; add to prevent folding into extload %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %c, i32 0, i32 8) @@ -843,17 +843,17 @@ define amdgpu_kernel void @sext_in_reg_i8_to_i32_bfe_wrong(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { ; GFX6-LABEL: sext_in_reg_i8_to_i32_bfe_wrong: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb -; GFX6-NEXT: s_load_dword s0, s[0:1], 0xc -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dword s3, s[0:1], 0xb +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xc +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_add_i32 s2, s2, s0 -; GFX6-NEXT: s_bfe_i32 s0, s2, 8 -; GFX6-NEXT: s_sext_i32_i8 s0, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_add_i32 s3, s3, s4 +; GFX6-NEXT: s_bfe_i32 s3, s3, 8 +; GFX6-NEXT: s_sext_i32_i8 s3, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %c = add i32 %a, %b ; add to prevent folding into extload %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %c, i32 8, i32 0) @@ -866,16 +866,16 @@ define amdgpu_kernel void @sextload_i8_to_i32_bfe(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) #0 { ; GFX6-LABEL: sextload_i8_to_i32_bfe: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_load_sbyte v0, off, s[4:7], 0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm %load = load i8, i8 addrspace(1)* %ptr, align 1 @@ -890,16 +890,16 @@ define amdgpu_kernel void @sextload_i8_to_i32_bfe_0(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) #0 { ; GFX6-LABEL: sextload_i8_to_i32_bfe_0: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_load_sbyte v0, off, s[4:7], 0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_bfe_i32 v0, v0, 8, 0 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm %load = load i8, i8 addrspace(1)* %ptr, align 1 @@ -914,17 +914,17 @@ define amdgpu_kernel void @sext_in_reg_i1_bfe_offset_0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: sext_in_reg_i1_bfe_offset_0: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_i32 s0, s0, 0x10000 -; GFX6-NEXT: s_bfe_i32 s0, s0, 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_bfe_i32 s3, s3, 0x10000 +; GFX6-NEXT: s_bfe_i32 s3, s3, 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 @@ -937,17 +937,17 @@ define amdgpu_kernel void @sext_in_reg_i1_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: sext_in_reg_i1_bfe_offset_1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_i32 s0, s0, 0x20000 -; GFX6-NEXT: s_bfe_i32 s0, s0, 0x10001 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_bfe_i32 s3, s3, 0x20000 +; GFX6-NEXT: s_bfe_i32 s3, s3, 0x10001 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 30 @@ -960,17 +960,17 @@ define amdgpu_kernel void @sext_in_reg_i2_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: sext_in_reg_i2_bfe_offset_1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_i32 s0, s0, 0x20000 -; GFX6-NEXT: s_bfe_i32 s0, s0, 0x20001 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_bfe_i32 s3, s3, 0x20000 +; GFX6-NEXT: s_bfe_i32 s3, s3, 0x20001 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 30 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll @@ -4,16 +4,16 @@ define amdgpu_kernel void @set_inactive(i32 addrspace(1)* %out, i32 %in) { ; GCN-LABEL: set_inactive: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 42 ; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0 store i32 %tmp, i32 addrspace(1)* %out @@ -43,38 +43,38 @@ define amdgpu_kernel void @set_inactive_scc(i32 addrspace(1)* %out, i32 %in, <4 x i32> inreg %desc) { ; GCN-LABEL: set_inactive_scc: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_buffer_load_dword s1, s[8:11], 0x0 -; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: s_buffer_load_dword s3, s[4:7], 0x0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s1, 56 -; GCN-NEXT: s_cselect_b32 s0, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s3, 56 +; GCN-NEXT: s_cselect_b32 s2, 1, 0 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 42 ; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_cmp_lg_u32 s2, 0 ; GCN-NEXT: s_cbranch_scc0 .LBB2_2 ; GCN-NEXT: ; %bb.1: ; %.one ; GCN-NEXT: v_add_u32_e32 v1, vcc, 1, v0 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s0, 0 -; GCN-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s4, 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_branch .LBB2_3 ; GCN-NEXT: .LBB2_2: -; GCN-NEXT: s_mov_b32 s0, -1 +; GCN-NEXT: s_mov_b32 s4, -1 ; GCN-NEXT: .LBB2_3: ; %Flow -; GCN-NEXT: s_xor_b32 s0, s0, -1 -; GCN-NEXT: s_and_b32 s0, s0, 1 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_xor_b32 s2, s4, -1 +; GCN-NEXT: s_and_b32 s2, s2, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB2_5 ; GCN-NEXT: ; %bb.4: ; %.zero -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: .LBB2_5: ; %.exit ; GCN-NEXT: s_endpgm %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll @@ -41,8 +41,8 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) { ; CI-LABEL: s_trig_preop_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0 @@ -52,8 +52,8 @@ ; ; VI-LABEL: s_trig_preop_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0 @@ -63,14 +63,25 @@ ; ; GFX9-LABEL: s_trig_preop_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0 ; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: s_trig_preop_f64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_trig_preop_f64 v[0:1], s[0:1], s2 +; GFX10-NEXT: flat_store_dwordx2 v[0:1], v[0:1] +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_endpgm %result = call double @llvm.amdgcn.trig.preop.f64(double %a, i32 %b) store volatile double %result, double* undef ret void @@ -85,6 +96,15 @@ ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_endpgm +; +; GFX10-LABEL: s_trig_preop_f64_imm: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7 +; GFX10-NEXT: flat_store_dwordx2 v[0:1], v[0:1] +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_endpgm %result = call double @llvm.amdgcn.trig.preop.f64(double %a, i32 7) store volatile double %result, double* undef ret void diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll @@ -44,16 +44,16 @@ define amdgpu_kernel void @bfe_u32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #0 { ; GFX6-LABEL: bfe_u32_arg_arg_arg: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dword s2, s[0:1], 0xc ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb -; GFX6-NEXT: s_load_dword s0, s[0:1], 0xc +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s1, s0, 63 -; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_or_b32 s0, s1, s0 -; GFX6-NEXT: s_bfe_u32 s0, s2, s0 +; GFX6-NEXT: s_and_b32 s1, s2, 63 +; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_or_b32 s1, s1, s2 +; GFX6-NEXT: s_bfe_u32 s0, s0, s1 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm @@ -65,17 +65,17 @@ define amdgpu_kernel void @bfe_u32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { ; GFX6-LABEL: bfe_u32_arg_arg_imm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb -; GFX6-NEXT: s_load_dword s0, s[0:1], 0xc -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dword s3, s[0:1], 0xc +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s0, s0, 63 -; GFX6-NEXT: s_or_b32 s0, s0, 0x7b0000 -; GFX6-NEXT: s_bfe_u32 s0, s2, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_and_b32 s3, s3, 63 +; GFX6-NEXT: s_or_b32 s3, s3, 0x7b0000 +; GFX6-NEXT: s_bfe_u32 s3, s4, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 123) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 @@ -85,17 +85,17 @@ define amdgpu_kernel void @bfe_u32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) #0 { ; GFX6-LABEL: bfe_u32_arg_imm_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb -; GFX6-NEXT: s_load_dword s0, s[0:1], 0xc -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dword s3, s[0:1], 0xc +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_or_b32 s0, 59, s0 -; GFX6-NEXT: s_bfe_u32 s0, s2, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_or_b32 s3, 59, s3 +; GFX6-NEXT: s_bfe_u32 s3, s4, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 123, i32 %src2) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 @@ -105,18 +105,18 @@ define amdgpu_kernel void @bfe_u32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) #0 { ; GFX6-LABEL: bfe_u32_imm_arg_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb -; GFX6-NEXT: s_load_dword s0, s[0:1], 0xc -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dword s3, s[0:1], 0xb +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xc +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s1, s2, 63 -; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_or_b32 s0, s1, s0 -; GFX6-NEXT: s_bfe_u32 s0, 0x7b, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_and_b32 s3, s3, 63 +; GFX6-NEXT: s_lshl_b32 s4, s4, 16 +; GFX6-NEXT: s_or_b32 s3, s3, s4 +; GFX6-NEXT: s_bfe_u32 s3, 0x7b, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 123, i32 %src1, i32 %src2) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 @@ -126,16 +126,16 @@ define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { ; GFX6-LABEL: bfe_u32_arg_0_width_reg_offset: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb -; GFX6-NEXT: s_load_dword s0, s[0:1], 0xc -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dword s3, s[0:1], 0xc +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s0, s0, 63 -; GFX6-NEXT: s_bfe_u32 s0, s2, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_and_b32 s3, s3, 63 +; GFX6-NEXT: s_bfe_u32 s3, s4, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 0) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 @@ -145,14 +145,14 @@ define amdgpu_kernel void @bfe_u32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { ; GFX6-LABEL: bfe_u32_arg_0_width_imm_offset: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dword s3, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_u32 s0, s0, 8 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_bfe_u32 s3, s3, 8 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 8, i32 0) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 @@ -162,15 +162,15 @@ define amdgpu_kernel void @bfe_u32_zextload_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_u32_zextload_i8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 8 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm %load = load i8, i8 addrspace(1)* %in @@ -184,18 +184,18 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_u32_zext_in_reg_i8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_add_i32 s0, s0, 1 -; GFX6-NEXT: s_and_b32 s0, s0, 0xff -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x80000 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_add_i32 s3, s3, 1 +; GFX6-NEXT: s_and_b32 s3, s3, 0xff +; GFX6-NEXT: s_bfe_u32 s3, s3, 0x80000 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 1 @@ -208,18 +208,18 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_u32_zext_in_reg_i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_add_i32 s0, s0, 1 -; GFX6-NEXT: s_and_b32 s0, s0, 0xffff -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_add_i32 s3, s3, 1 +; GFX6-NEXT: s_and_b32 s3, s3, 0xffff +; GFX6-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 1 @@ -232,18 +232,18 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_u32_zext_in_reg_i8_offset_1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_add_i32 s0, s0, 1 -; GFX6-NEXT: s_and_b32 s0, s0, 0xff -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x80001 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_add_i32 s3, s3, 1 +; GFX6-NEXT: s_and_b32 s3, s3, 0xff +; GFX6-NEXT: s_bfe_u32 s3, s3, 0x80001 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 1 @@ -256,18 +256,18 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_u32_zext_in_reg_i8_offset_3: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_add_i32 s0, s0, 1 -; GFX6-NEXT: s_and_b32 s0, s0, 0xff -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x80003 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_add_i32 s3, s3, 1 +; GFX6-NEXT: s_and_b32 s3, s3, 0xff +; GFX6-NEXT: s_bfe_u32 s3, s3, 0x80003 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 1 @@ -280,18 +280,18 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_u32_zext_in_reg_i8_offset_7: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_add_i32 s0, s0, 1 -; GFX6-NEXT: s_and_b32 s0, s0, 0xff -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x80007 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_add_i32 s3, s3, 1 +; GFX6-NEXT: s_and_b32 s3, s3, 0xff +; GFX6-NEXT: s_bfe_u32 s3, s3, 0x80007 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 1 @@ -304,18 +304,18 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i16_offset_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_u32_zext_in_reg_i16_offset_8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_add_i32 s0, s0, 1 -; GFX6-NEXT: s_and_b32 s0, s0, 0xffff -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x80008 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_add_i32 s3, s3, 1 +; GFX6-NEXT: s_and_b32 s3, s3, 0xffff +; GFX6-NEXT: s_bfe_u32 s3, s3, 0x80008 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 1 @@ -328,16 +328,16 @@ define amdgpu_kernel void @bfe_u32_test_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_u32_test_1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_bfe_u32 s3, s3, 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 0, i32 1) @@ -348,17 +348,17 @@ define amdgpu_kernel void @bfe_u32_test_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_u32_test_2: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s0, s0, 31 -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x80000 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_lshl_b32 s3, s3, 31 +; GFX6-NEXT: s_bfe_u32 s3, s3, 0x80000 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 @@ -370,17 +370,17 @@ define amdgpu_kernel void @bfe_u32_test_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_u32_test_3: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s0, s0, 31 -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_lshl_b32 s3, s3, 31 +; GFX6-NEXT: s_bfe_u32 s3, s3, 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 @@ -392,17 +392,17 @@ define amdgpu_kernel void @bfe_u32_test_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_u32_test_4: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x10000 -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x1001f -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_bfe_u32 s3, s3, 0x10000 +; GFX6-NEXT: s_bfe_u32 s3, s3, 0x1001f +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 @@ -415,17 +415,17 @@ define amdgpu_kernel void @bfe_u32_test_5(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_u32_test_5: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_i32 s0, s0, 0x10000 -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_bfe_i32 s3, s3, 0x10000 +; GFX6-NEXT: s_bfe_u32 s3, s3, 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 @@ -438,17 +438,17 @@ define amdgpu_kernel void @bfe_u32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_u32_test_6: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s0, s0, 31 -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x1f0001 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_lshl_b32 s3, s3, 31 +; GFX6-NEXT: s_bfe_u32 s3, s3, 0x1f0001 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 @@ -460,17 +460,17 @@ define amdgpu_kernel void @bfe_u32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_u32_test_7: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s0, s0, 31 -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x1f0000 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_lshl_b32 s3, s3, 31 +; GFX6-NEXT: s_bfe_u32 s3, s3, 0x1f0000 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 @@ -482,17 +482,17 @@ define amdgpu_kernel void @bfe_u32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_u32_test_8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s0, s0, 31 -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x1001f -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_lshl_b32 s3, s3, 31 +; GFX6-NEXT: s_bfe_u32 s3, s3, 0x1001f +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 @@ -504,16 +504,16 @@ define amdgpu_kernel void @bfe_u32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_u32_test_9: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x1001f -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_bfe_u32 s3, s3, 0x1001f +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 31, i32 1) @@ -524,16 +524,16 @@ define amdgpu_kernel void @bfe_u32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_u32_test_10: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x1f0001 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_bfe_u32 s3, s3, 0x1f0001 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 1, i32 31) @@ -544,16 +544,16 @@ define amdgpu_kernel void @bfe_u32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_u32_test_11: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x180008 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_bfe_u32 s3, s3, 0x180008 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 8, i32 24) @@ -564,16 +564,16 @@ define amdgpu_kernel void @bfe_u32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_u32_test_12: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x80018 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_bfe_u32 s3, s3, 0x80018 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 24, i32 8) @@ -585,17 +585,17 @@ define amdgpu_kernel void @bfe_u32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_u32_test_13: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_ashr_i32 s0, s0, 31 -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x1001f -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_ashr_i32 s3, s3, 31 +; GFX6-NEXT: s_bfe_u32 s3, s3, 0x1001f +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %shl = ashr i32 %x, 31 @@ -606,17 +606,17 @@ define amdgpu_kernel void @bfe_u32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_u32_test_14: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshr_b32 s0, s0, 31 -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x1001f -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_lshr_b32 s3, s3, 31 +; GFX6-NEXT: s_bfe_u32 s3, s3, 0x1001f +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %shl = lshr i32 %x, 31 @@ -948,21 +948,21 @@ define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0, ; GFX6-LABEL: simplify_bfe_u32_multi_use_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s0, s0, 63 -; GFX6-NEXT: s_bfe_u32 s1, s0, 0x20002 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: s_and_b32 s8, s8, 63 +; GFX6-NEXT: s_bfe_u32 s9, s8, 0x20002 +; GFX6-NEXT: v_mov_b32_e32 v1, s9 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 ; GFX6-NEXT: buffer_store_dword v1, off, s[4:7], 0 -; GFX6-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 { @@ -977,14 +977,14 @@ define amdgpu_kernel void @lshr_and(i32 addrspace(1)* %out, i32 %a) #0 { ; GFX6-LABEL: lshr_and: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dword s3, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x30006 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_bfe_u32 s3, s3, 0x30006 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %b = lshr i32 %a, 6 %c = and i32 %b, 7 @@ -995,16 +995,16 @@ define amdgpu_kernel void @v_lshr_and(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { ; GFX6-LABEL: v_lshr_and: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb -; GFX6-NEXT: s_load_dword s0, s[0:1], 0xc -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dword s3, s[0:1], 0xb +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xc +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshr_b32 s0, s2, s0 -; GFX6-NEXT: s_and_b32 s0, s0, 7 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_lshr_b32 s3, s3, s4 +; GFX6-NEXT: s_and_b32 s3, s3, 7 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %c = lshr i32 %a, %b %d = and i32 %c, 7 @@ -1015,14 +1015,14 @@ define amdgpu_kernel void @and_lshr(i32 addrspace(1)* %out, i32 %a) #0 { ; GFX6-LABEL: and_lshr: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dword s3, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x30006 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_bfe_u32 s3, s3, 0x30006 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %b = and i32 %a, 448 %c = lshr i32 %b, 6 @@ -1033,14 +1033,14 @@ define amdgpu_kernel void @and_lshr2(i32 addrspace(1)* %out, i32 %a) #0 { ; GFX6-LABEL: and_lshr2: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dword s3, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x30006 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_bfe_u32 s3, s3, 0x30006 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %b = and i32 %a, 511 %c = lshr i32 %b, 6 @@ -1051,14 +1051,14 @@ define amdgpu_kernel void @shl_lshr(i32 addrspace(1)* %out, i32 %a) #0 { ; GFX6-LABEL: shl_lshr: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dword s3, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x150002 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_bfe_u32 s3, s3, 0x150002 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %b = shl i32 %a, 9 %c = lshr i32 %b, 11 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll @@ -5,15 +5,15 @@ define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in1, i32 %in2) { ; GFX8-LABEL: dpp_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s3 ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll @@ -69,45 +69,48 @@ ; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s6, 0 ; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NOUNALIGNED-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v2, v[0:1], s[4:7], 0 addr64 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v3, v[0:1], s[4:7], 0 addr64 offset:1 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:2 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v5, v[0:1], s[4:7], 0 addr64 offset:3 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v6, v[0:1], s[4:7], 0 addr64 offset:4 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v7, v[0:1], s[4:7], 0 addr64 offset:5 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v8, v[0:1], s[4:7], 0 addr64 offset:6 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v9, v[0:1], s[4:7], 0 addr64 offset:7 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v10, v[0:1], s[4:7], 0 addr64 offset:8 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v11, v[0:1], s[4:7], 0 addr64 offset:9 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v12, v[0:1], s[4:7], 0 addr64 offset:10 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:11 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v2, v[0:1], s[4:7], 0 addr64 offset:1 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v3, v[0:1], s[4:7], 0 addr64 offset:2 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:3 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v5, v[0:1], s[4:7], 0 addr64 offset:5 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v6, v[0:1], s[4:7], 0 addr64 offset:6 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v7, v[0:1], s[4:7], 0 addr64 offset:7 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v8, v[0:1], s[4:7], 0 addr64 offset:9 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v9, v[0:1], s[4:7], 0 addr64 offset:10 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v10, v[0:1], s[4:7], 0 addr64 offset:11 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v11, v[0:1], s[4:7], 0 addr64 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v12, v[0:1], s[4:7], 0 addr64 offset:4 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:8 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(11) +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 8, v2 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10) -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9) -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 24, v4 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8) -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 24, v5 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 8, v5 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(7) +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6) -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 24, v7 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 8, v8 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 24, v9 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3) +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v9, 24, v10 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v9, 8, v11 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v11, v1 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v4, v12, v4 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v12, 24, v0 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v2, v1 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v6, v5 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v10, v9 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v1, v7 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v2, v11 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v1, v8 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v2, v12 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v4, v0, v8 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v1, v3 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v2, v6 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v4, v9 ; GFX7-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(4)* %ptr, align 1 ret <3 x i32> %load @@ -154,21 +157,24 @@ ; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s6, 0 ; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NOUNALIGNED-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:10 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:6 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:10 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:4 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v7, v[0:1], s[4:7], 0 addr64 offset:8 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) ; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3) +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v6, v1 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v2, v1 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v4, v3 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v6, v5 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX7-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(4)* %ptr, align 2 ret <3 x i32> %load @@ -399,45 +405,48 @@ ; GFX7-NOUNALIGNED: ; %bb.0: ; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s2, -1 ; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:1 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v2, off, s[0:3], 0 offset:2 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v3, off, s[0:3], 0 offset:3 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v4, off, s[0:3], 0 offset:4 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v5, off, s[0:3], 0 offset:5 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v6, off, s[0:3], 0 offset:6 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v7, off, s[0:3], 0 offset:7 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v8, off, s[0:3], 0 offset:8 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v9, off, s[0:3], 0 offset:9 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v10, off, s[0:3], 0 offset:10 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v11, off, s[0:3], 0 offset:11 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:1 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:2 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v2, off, s[0:3], 0 offset:3 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v3, off, s[0:3], 0 offset:5 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v4, off, s[0:3], 0 offset:6 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v5, off, s[0:3], 0 offset:7 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v6, off, s[0:3], 0 offset:9 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v7, off, s[0:3], 0 offset:10 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v8, off, s[0:3], 0 offset:11 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v9, off, s[0:3], 0 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v10, off, s[0:3], 0 offset:4 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v11, off, s[0:3], 0 offset:8 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(11) +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10) -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9) -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8) -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(7) +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6) -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v4, v5 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3) +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v9, v0 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v4, v8, v9 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v3, v10, v3 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v6, v11, v6 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v3, v4 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v3, v6, v7 ; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v1, v6 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v4, v10 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v1, v7 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v2, v11 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v3, v8 ; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 ; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2 @@ -492,21 +501,24 @@ ; GFX7-NOUNALIGNED: ; %bb.0: ; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s2, -1 ; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v0, off, s[0:3], 0 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:2 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:4 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:6 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v4, off, s[0:3], 0 offset:8 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v5, off, s[0:3], 0 offset:10 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:2 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:6 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:10 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v3, off, s[0:3], 0 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v4, off, s[0:3], 0 offset:4 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v5, off, s[0:3], 0 offset:8 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) ; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3) +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v5, v2 ; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 ; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll @@ -54,19 +54,19 @@ ; GFX9-NEXT: v_or3_b32 v3, v1, v2, v3 ; GFX9-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-NEXT: v_lshl_or_b32 v1, v6, 8, v5 -; GFX9-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v8 -; GFX9-NEXT: v_or3_b32 v1, v1, v2, v4 -; GFX9-NEXT: ds_read_u8 v2, v0 offset:8 -; GFX9-NEXT: ds_read_u8 v4, v0 offset:9 -; GFX9-NEXT: ds_read_u8 v5, v0 offset:10 +; GFX9-NEXT: ds_read_u8 v4, v0 offset:8 +; GFX9-NEXT: ds_read_u8 v5, v0 offset:9 +; GFX9-NEXT: ds_read_u8 v6, v0 offset:10 ; GFX9-NEXT: ds_read_u8 v0, v0 offset:11 +; GFX9-NEXT: s_waitcnt lgkmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; GFX9-NEXT: s_waitcnt lgkmcnt(4) +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v8 +; GFX9-NEXT: v_or3_b32 v1, v1, v2, v7 ; GFX9-NEXT: s_waitcnt lgkmcnt(2) -; GFX9-NEXT: v_lshl_or_b32 v2, v4, 8, v2 +; GFX9-NEXT: v_lshl_or_b32 v2, v5, 8, v4 ; GFX9-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX9-NEXT: v_or3_b32 v2, v2, v4, v0 @@ -99,18 +99,18 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(1) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v7 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v8 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: ds_read_u8 v2, v0 offset:8 ; GFX7-NEXT: ds_read_u8 v4, v0 offset:9 -; GFX7-NEXT: ds_read_u8 v5, v0 offset:10 +; GFX7-NEXT: ds_read_u8 v6, v0 offset:10 ; GFX7-NEXT: ds_read_u8 v0, v0 offset:11 +; GFX7-NEXT: s_waitcnt lgkmcnt(4) +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v8 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX7-NEXT: s_waitcnt lgkmcnt(2) ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(1) -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll @@ -160,18 +160,18 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(1) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v7 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v8 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: ds_read_u8 v2, v0 offset:8 ; GFX7-NEXT: ds_read_u8 v4, v0 offset:9 -; GFX7-NEXT: ds_read_u8 v5, v0 offset:10 +; GFX7-NEXT: ds_read_u8 v6, v0 offset:10 ; GFX7-NEXT: ds_read_u8 v0, v0 offset:11 +; GFX7-NEXT: s_waitcnt lgkmcnt(4) +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v8 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX7-NEXT: s_waitcnt lgkmcnt(2) ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(1) -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll @@ -106,18 +106,18 @@ ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, gv2@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, gv2@gotpcrel32@hi+12 -; GFX9-NEXT: s_getpc_b64 s[2:3] -; GFX9-NEXT: s_add_u32 s2, s2, gv3@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s3, s3, gv3@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, gv3@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, gv3@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 1 ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v0, v0, s[4:5] +; GFX9-NEXT: global_store_dword v0, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB1_2: ; %Flow ; GFX9-NEXT: s_xor_b32 s0, s0, -1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll @@ -27,22 +27,23 @@ ; GCN-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB0_3 ; GCN-NEXT: ; %bb.2: ; %bb.1 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GCN-NEXT: s_load_dword s8, s[4:5], 0x10 -; GCN-NEXT: s_add_u32 s4, s32, 0x1000 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x10 +; GCN-NEXT: s_add_u32 s7, s32, 0x1000 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NEXT: v_mov_b32_e32 v3, 1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s5, s8, 2 -; GCN-NEXT: s_add_u32 s4, s4, s5 +; GCN-NEXT: s_lshl_b32 s6, s6, 2 +; GCN-NEXT: s_add_u32 s6, s7, s6 ; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 -; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_u32_e32 v0, v2, v0 -; GCN-NEXT: global_store_dword v1, v0, s[6:7] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: global_store_dword v1, v0, s[4:5] ; GCN-NEXT: .LBB0_3: ; %bb.2 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: global_store_dword v[0:1], v0, off @@ -93,23 +94,24 @@ ; GCN-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB1_2 ; GCN-NEXT: ; %bb.1: ; %bb.0 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GCN-NEXT: s_load_dword s8, s[4:5], 0xc -; GCN-NEXT: s_add_u32 s4, s32, 0x1000 -; GCN-NEXT: s_and_b32 s4, s4, 0xfffff000 +; GCN-NEXT: s_load_dword s6, s[4:5], 0xc +; GCN-NEXT: s_add_u32 s7, s32, 0x1000 +; GCN-NEXT: s_and_b32 s7, s7, 0xfffff000 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s5, s8, 2 +; GCN-NEXT: s_lshl_b32 s6, s6, 2 ; GCN-NEXT: v_mov_b32_e32 v3, 1 -; GCN-NEXT: s_add_u32 s4, s4, s5 +; GCN-NEXT: s_add_u32 s6, s7, s6 ; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 -; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_u32_e32 v0, v2, v0 -; GCN-NEXT: global_store_dword v1, v0, s[6:7] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: global_store_dword v1, v0, s[4:5] ; GCN-NEXT: .LBB1_2: ; %bb.1 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: global_store_dword v[0:1], v0, off diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -6,31 +6,31 @@ define amdgpu_kernel void @sdivrem_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) { ; GFX8-LABEL: sdivrem_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_ashr_i32 s6, s1, 31 -; GFX8-NEXT: s_add_i32 s1, s1, s6 -; GFX8-NEXT: s_xor_b32 s7, s1, s6 +; GFX8-NEXT: s_ashr_i32 s8, s7, 31 +; GFX8-NEXT: s_add_i32 s0, s7, s8 +; GFX8-NEXT: s_xor_b32 s7, s0, s8 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX8-NEXT: s_sub_i32 s1, 0, s7 -; GFX8-NEXT: s_ashr_i32 s8, s0, 31 -; GFX8-NEXT: s_add_i32 s0, s0, s8 +; GFX8-NEXT: s_sub_i32 s0, 0, s7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: s_xor_b32 s9, s0, s8 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: v_mul_lo_u32 v1, s1, v0 +; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX8-NEXT: s_xor_b32 s4, s8, s6 +; GFX8-NEXT: s_ashr_i32 s4, s6, 31 +; GFX8-NEXT: s_add_i32 s5, s6, s4 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX8-NEXT: s_xor_b32 s5, s5, s4 +; GFX8-NEXT: s_xor_b32 s6, s4, s8 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; GFX8-NEXT: v_mul_hi_u32 v2, s9, v0 +; GFX8-NEXT: v_mul_hi_u32 v2, s5, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mul_lo_u32 v3, v2, s7 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s9, v3 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s5, v3 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 @@ -39,13 +39,13 @@ ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, s4, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s4, v2 -; GFX8-NEXT: v_xor_b32_e32 v3, s8, v3 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s6, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, s4, v3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s8, v3 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s4, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v3 ; GFX8-NEXT: s_endpgm @@ -746,13 +746,13 @@ ; GFX9-LABEL: sdivrem_v2i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x18 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s10, s6, 31 ; GFX9-NEXT: s_add_i32 s0, s6, s10 ; GFX9-NEXT: s_xor_b32 s6, s0, s10 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x10 ; GFX9-NEXT: s_ashr_i32 s5, s7, 31 ; GFX9-NEXT: s_add_i32 s7, s7, s5 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -762,7 +762,6 @@ ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s8, 31 ; GFX9-NEXT: s_add_i32 s8, s8, s4 ; GFX9-NEXT: v_mul_lo_u32 v2, s11, v0 @@ -815,6 +814,7 @@ ; GFX9-NEXT: v_xor_b32_e32 v3, s11, v3 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_subrev_u32_e32 v3, s11, v3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] ; GFX9-NEXT: s_endpgm @@ -906,9 +906,9 @@ ; GFX8-LABEL: sdivrem_v4i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 ; GFX8-NEXT: v_mov_b32_e32 v3, 0x4f7ffffe +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_ashr_i32 s12, s0, 31 ; GFX8-NEXT: s_add_i32 s0, s0, s12 @@ -920,14 +920,14 @@ ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: s_xor_b32 s16, s0, s15 ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s16 -; GFX8-NEXT: s_ashr_i32 s14, s4, 31 +; GFX8-NEXT: s_ashr_i32 s14, s8, 31 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: s_add_i32 s0, s4, s14 +; GFX8-NEXT: s_add_i32 s0, s8, s14 ; GFX8-NEXT: s_xor_b32 s0, s0, s14 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX8-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX8-NEXT: s_ashr_i32 s4, s5, 31 +; GFX8-NEXT: s_ashr_i32 s8, s9, 31 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 @@ -947,21 +947,21 @@ ; GFX8-NEXT: s_sub_i32 s0, 0, s16 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_mul_lo_u32 v4, s0, v1 -; GFX8-NEXT: s_add_i32 s1, s5, s4 -; GFX8-NEXT: s_xor_b32 s1, s1, s4 +; GFX8-NEXT: s_add_i32 s1, s9, s8 +; GFX8-NEXT: s_xor_b32 s1, s1, s8 ; GFX8-NEXT: s_xor_b32 s0, s14, s12 ; GFX8-NEXT: v_mul_hi_u32 v4, v1, v4 ; GFX8-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_xor_b32_e32 v2, s14, v2 -; GFX8-NEXT: s_ashr_i32 s5, s2, 31 +; GFX8-NEXT: s_ashr_i32 s9, s2, 31 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v4 ; GFX8-NEXT: v_mul_hi_u32 v1, s1, v1 ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s0, v0 ; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s14, v2 ; GFX8-NEXT: v_mul_lo_u32 v5, v1, s16 -; GFX8-NEXT: s_add_i32 s0, s2, s5 -; GFX8-NEXT: s_xor_b32 s2, s0, s5 -; GFX8-NEXT: s_ashr_i32 s12, s6, 31 +; GFX8-NEXT: s_add_i32 s0, s2, s9 +; GFX8-NEXT: s_xor_b32 s2, s0, s9 +; GFX8-NEXT: s_ashr_i32 s12, s10, 31 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s1, v5 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 1, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s16, v2 @@ -979,20 +979,20 @@ ; GFX8-NEXT: s_sub_i32 s0, 0, s2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; GFX8-NEXT: v_mul_lo_u32 v6, s0, v5 -; GFX8-NEXT: s_add_i32 s1, s6, s12 +; GFX8-NEXT: s_add_i32 s1, s10, s12 ; GFX8-NEXT: s_xor_b32 s1, s1, s12 -; GFX8-NEXT: s_xor_b32 s0, s4, s15 +; GFX8-NEXT: s_xor_b32 s0, s8, s15 ; GFX8-NEXT: v_mul_hi_u32 v6, v5, v6 -; GFX8-NEXT: v_xor_b32_e32 v2, s4, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, s8, v2 ; GFX8-NEXT: v_xor_b32_e32 v1, s0, v1 ; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, s0, v1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v6 ; GFX8-NEXT: v_mul_hi_u32 v6, s1, v5 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s4, v2 -; GFX8-NEXT: s_ashr_i32 s4, s3, 31 +; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s8, v2 +; GFX8-NEXT: s_ashr_i32 s8, s3, 31 ; GFX8-NEXT: v_mul_lo_u32 v7, v6, s2 -; GFX8-NEXT: s_add_i32 s0, s3, s4 -; GFX8-NEXT: s_xor_b32 s3, s0, s4 +; GFX8-NEXT: s_add_i32 s0, s3, s8 +; GFX8-NEXT: s_xor_b32 s3, s0, s8 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s1, v7 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 1, v6 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 @@ -1009,12 +1009,12 @@ ; GFX8-NEXT: s_sub_i32 s0, 0, s3 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v2, v7, vcc ; GFX8-NEXT: v_mul_lo_u32 v2, s0, v3 -; GFX8-NEXT: s_ashr_i32 s2, s7, 31 -; GFX8-NEXT: s_add_i32 s1, s7, s2 +; GFX8-NEXT: s_ashr_i32 s2, s11, 31 +; GFX8-NEXT: s_add_i32 s1, s11, s2 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc ; GFX8-NEXT: v_mul_hi_u32 v2, v3, v2 ; GFX8-NEXT: s_xor_b32 s1, s1, s2 -; GFX8-NEXT: s_xor_b32 s0, s12, s5 +; GFX8-NEXT: s_xor_b32 s0, s12, s9 ; GFX8-NEXT: v_xor_b32_e32 v6, s0, v6 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_mul_hi_u32 v3, s1, v2 @@ -1032,17 +1032,17 @@ ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v7 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc ; GFX8-NEXT: v_subrev_u32_e64 v8, s[0:1], s3, v7 -; GFX8-NEXT: s_xor_b32 s0, s2, s4 +; GFX8-NEXT: s_xor_b32 s0, s2, s8 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc ; GFX8-NEXT: v_xor_b32_e32 v3, s0, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, s8 +; GFX8-NEXT: v_mov_b32_e32 v9, s5 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s0, v3 -; GFX8-NEXT: v_mov_b32_e32 v9, s9 +; GFX8-NEXT: v_mov_b32_e32 v8, s4 ; GFX8-NEXT: v_xor_b32_e32 v7, s2, v7 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s2, v7 -; GFX8-NEXT: v_mov_b32_e32 v0, s10 -; GFX8-NEXT: v_mov_b32_e32 v1, s11 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX8-NEXT: s_endpgm ; @@ -2353,48 +2353,48 @@ define amdgpu_kernel void @sdiv_i8(i8 addrspace(1)* %out0, i8 addrspace(1)* %out1, i8 %x, i8 %y) { ; GFX8-LABEL: sdiv_i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x10 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s1, s0, 0x80008 -; GFX8-NEXT: s_ashr_i32 s6, s1, 31 -; GFX8-NEXT: s_add_i32 s1, s1, s6 -; GFX8-NEXT: s_xor_b32 s7, s1, s6 -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX8-NEXT: s_sub_i32 s1, 0, s7 -; GFX8-NEXT: s_sext_i32_i8 s0, s0 -; GFX8-NEXT: s_ashr_i32 s8, s0, 31 +; GFX8-NEXT: s_bfe_i32 s0, s6, 0x80008 +; GFX8-NEXT: s_ashr_i32 s7, s0, 31 +; GFX8-NEXT: s_add_i32 s0, s0, s7 +; GFX8-NEXT: s_xor_b32 s8, s0, s7 +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX8-NEXT: s_sub_i32 s0, 0, s8 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: s_add_i32 s0, s0, s8 -; GFX8-NEXT: s_xor_b32 s9, s0, s8 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: v_mul_lo_u32 v1, s1, v0 +; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX8-NEXT: s_xor_b32 s4, s8, s6 +; GFX8-NEXT: s_sext_i32_i8 s4, s6 +; GFX8-NEXT: s_ashr_i32 s5, s4, 31 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX8-NEXT: s_add_i32 s4, s4, s5 +; GFX8-NEXT: s_xor_b32 s4, s4, s5 +; GFX8-NEXT: s_xor_b32 s6, s5, s7 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; GFX8-NEXT: v_mul_hi_u32 v2, s9, v0 +; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mul_lo_u32 v3, v2, s7 +; GFX8-NEXT: v_mul_lo_u32 v3, v2, s8 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s9, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s8, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, s4, v2 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s8, v3 +; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s4, v2 -; GFX8-NEXT: v_xor_b32_e32 v3, s8, v3 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s6, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, s5, v3 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s8, v3 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s5, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_byte v[0:1], v3 ; GFX8-NEXT: s_endpgm @@ -2764,48 +2764,48 @@ define amdgpu_kernel void @sdiv_i16(i16 addrspace(1)* %out0, i16 addrspace(1)* %out1, i16 %x, i16 %y) { ; GFX8-LABEL: sdiv_i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x10 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s1, s0, 0x100010 -; GFX8-NEXT: s_ashr_i32 s6, s1, 31 -; GFX8-NEXT: s_add_i32 s1, s1, s6 -; GFX8-NEXT: s_xor_b32 s7, s1, s6 -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX8-NEXT: s_sub_i32 s1, 0, s7 -; GFX8-NEXT: s_sext_i32_i16 s0, s0 -; GFX8-NEXT: s_ashr_i32 s8, s0, 31 +; GFX8-NEXT: s_bfe_i32 s0, s6, 0x100010 +; GFX8-NEXT: s_ashr_i32 s7, s0, 31 +; GFX8-NEXT: s_add_i32 s0, s0, s7 +; GFX8-NEXT: s_xor_b32 s8, s0, s7 +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX8-NEXT: s_sub_i32 s0, 0, s8 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: s_add_i32 s0, s0, s8 -; GFX8-NEXT: s_xor_b32 s9, s0, s8 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: v_mul_lo_u32 v1, s1, v0 +; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX8-NEXT: s_xor_b32 s4, s8, s6 +; GFX8-NEXT: s_sext_i32_i16 s4, s6 +; GFX8-NEXT: s_ashr_i32 s5, s4, 31 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX8-NEXT: s_add_i32 s4, s4, s5 +; GFX8-NEXT: s_xor_b32 s4, s4, s5 +; GFX8-NEXT: s_xor_b32 s6, s5, s7 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; GFX8-NEXT: v_mul_hi_u32 v2, s9, v0 +; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mul_lo_u32 v3, v2, s7 +; GFX8-NEXT: v_mul_lo_u32 v3, v2, s8 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s9, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s8, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, s4, v2 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s8, v3 +; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s4, v2 -; GFX8-NEXT: v_xor_b32_e32 v3, s8, v3 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s6, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, s5, v3 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s8, v3 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s5, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_short v[0:1], v3 ; GFX8-NEXT: s_endpgm @@ -3007,8 +3007,8 @@ ; GFX9-NEXT: s_add_i32 s0, s0, s7 ; GFX9-NEXT: s_xor_b32 s8, s0, s7 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s9, s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_mov_b32 s4, 0x100010 ; GFX9-NEXT: s_bfe_i32 s6, s6, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -3176,47 +3176,47 @@ define amdgpu_kernel void @sdivrem_i3(i3 addrspace(1)* %out0, i3 addrspace(1)* %out1, i3 %x, i3 %y) { ; GFX8-LABEL: sdivrem_i3: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x10 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s1, s0, 0x30008 -; GFX8-NEXT: s_ashr_i32 s6, s1, 31 -; GFX8-NEXT: s_add_i32 s1, s1, s6 -; GFX8-NEXT: s_xor_b32 s7, s1, s6 -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX8-NEXT: s_sub_i32 s1, 0, s7 -; GFX8-NEXT: s_bfe_i32 s0, s0, 0x30000 -; GFX8-NEXT: s_ashr_i32 s8, s0, 31 +; GFX8-NEXT: s_bfe_i32 s0, s6, 0x30008 +; GFX8-NEXT: s_ashr_i32 s7, s0, 31 +; GFX8-NEXT: s_add_i32 s0, s0, s7 +; GFX8-NEXT: s_xor_b32 s8, s0, s7 +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX8-NEXT: s_sub_i32 s0, 0, s8 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: s_add_i32 s0, s0, s8 -; GFX8-NEXT: s_xor_b32 s9, s0, s8 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: v_mul_lo_u32 v1, s1, v0 +; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX8-NEXT: s_xor_b32 s4, s8, s6 +; GFX8-NEXT: s_bfe_i32 s4, s6, 0x30000 +; GFX8-NEXT: s_ashr_i32 s5, s4, 31 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX8-NEXT: s_add_i32 s4, s4, s5 +; GFX8-NEXT: s_xor_b32 s4, s4, s5 +; GFX8-NEXT: s_xor_b32 s6, s5, s7 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; GFX8-NEXT: v_mul_hi_u32 v2, s9, v0 +; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mul_lo_u32 v3, v2, s7 +; GFX8-NEXT: v_mul_lo_u32 v3, v2, s8 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s9, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s8, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, s4, v2 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s8, v3 +; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s4, v2 -; GFX8-NEXT: v_xor_b32_e32 v3, s8, v3 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s6, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, s5, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s8, v3 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s5, v3 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll @@ -9,28 +9,28 @@ define amdgpu_kernel void @store_lds_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 ; GFX9-NEXT: ds_write_b128 v4, v[0:3] ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v4i32: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: ds_write_b128 v4, v[0:3] ; GFX7-NEXT: s_endpgm ; @@ -54,12 +54,12 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: s_lshr_b32 s0, s4, 8 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: s_lshr_b32 s1, s4, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -106,50 +106,50 @@ ; ; GFX7-LABEL: store_lds_v4i32_align1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: s_lshr_b32 s5, s0, 8 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: s_lshr_b32 s6, s0, 16 +; GFX7-NEXT: s_lshr_b32 s1, s4, 8 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_lshr_b32 s2, s4, 16 ; GFX7-NEXT: ds_write_b8 v1, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s5 -; GFX7-NEXT: s_lshr_b32 s7, s0, 24 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: s_lshr_b32 s3, s4, 24 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:1 -; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:2 -; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s3 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:3 -; GFX7-NEXT: s_lshr_b32 s0, s1, 8 -; GFX7-NEXT: v_mov_b32_e32 v0, s1 -; GFX7-NEXT: s_lshr_b32 s4, s1, 16 +; GFX7-NEXT: s_lshr_b32 s0, s5, 8 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_lshr_b32 s1, s5, 16 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:4 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: s_lshr_b32 s5, s1, 24 +; GFX7-NEXT: s_lshr_b32 s2, s5, 24 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:5 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:6 -; GFX7-NEXT: v_mov_b32_e32 v0, s5 -; GFX7-NEXT: ds_write_b8 v1, v0 offset:7 -; GFX7-NEXT: s_lshr_b32 s0, s2, 8 ; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: s_lshr_b32 s1, s2, 16 +; GFX7-NEXT: ds_write_b8 v1, v0 offset:7 +; GFX7-NEXT: s_lshr_b32 s0, s6, 8 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshr_b32 s1, s6, 16 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:8 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: s_lshr_b32 s4, s2, 24 +; GFX7-NEXT: s_lshr_b32 s2, s6, 24 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:9 ; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:10 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:11 -; GFX7-NEXT: s_lshr_b32 s0, s3, 8 -; GFX7-NEXT: v_mov_b32_e32 v0, s3 -; GFX7-NEXT: s_lshr_b32 s1, s3, 16 +; GFX7-NEXT: s_lshr_b32 s0, s7, 8 +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: s_lshr_b32 s1, s7, 16 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:12 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: s_lshr_b32 s2, s3, 24 +; GFX7-NEXT: s_lshr_b32 s2, s7, 24 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:13 ; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:14 @@ -216,12 +216,12 @@ define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: s_lshr_b32 s0, s4, 16 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: ds_write_b16 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:2 @@ -244,28 +244,28 @@ ; ; GFX7-LABEL: store_lds_v4i32_align2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: s_lshr_b32 s5, s0, 16 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_lshr_b32 s1, s4, 16 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-NEXT: ds_write_b16 v1, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s5 -; GFX7-NEXT: ds_write_b16 v1, v0 offset:2 -; GFX7-NEXT: s_lshr_b32 s0, s1, 16 ; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: ds_write_b16 v1, v0 offset:2 +; GFX7-NEXT: s_lshr_b32 s0, s5, 16 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_write_b16 v1, v0 offset:4 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: ds_write_b16 v1, v0 offset:6 -; GFX7-NEXT: s_lshr_b32 s0, s2, 16 -; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: s_lshr_b32 s0, s6, 16 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ds_write_b16 v1, v0 offset:8 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: ds_write_b16 v1, v0 offset:10 -; GFX7-NEXT: s_lshr_b32 s0, s3, 16 -; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: s_lshr_b32 s0, s7, 16 +; GFX7-NEXT: v_mov_b32_e32 v0, s7 ; GFX7-NEXT: ds_write_b16 v1, v0 offset:12 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: ds_write_b16 v1, v0 offset:14 @@ -306,11 +306,11 @@ define amdgpu_kernel void @store_lds_v4i32_align4(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: ds_write2_b32 v1, v0, v2 offset1:1 @@ -320,16 +320,16 @@ ; ; GFX7-LABEL: store_lds_v4i32_align4: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_write2_b32 v1, v0, v2 offset1:1 -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v2, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 ; GFX7-NEXT: ds_write2_b32 v1, v0, v2 offset0:2 offset1:3 ; GFX7-NEXT: s_endpgm ; @@ -354,28 +354,28 @@ define amdgpu_kernel void @store_lds_v4i32_align8(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 ; GFX9-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v4i32_align8: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 ; GFX7-NEXT: s_endpgm ; @@ -399,28 +399,28 @@ define amdgpu_kernel void @store_lds_v4i32_align16(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 ; GFX9-NEXT: ds_write_b128 v4, v[0:3] ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v4i32_align16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: ds_write_b128 v4, v[0:3] ; GFX7-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll @@ -9,26 +9,26 @@ define amdgpu_kernel void @store_lds_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s12 ; GFX9-NEXT: v_mov_b32_e32 v1, s13 ; GFX9-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: ds_write_b96 v3, v[0:2] ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v3i32: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0xd +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, s4 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s12 +; GFX7-NEXT: v_mov_b32_e32 v1, s13 +; GFX7-NEXT: v_mov_b32_e32 v2, s14 +; GFX7-NEXT: v_mov_b32_e32 v3, s0 ; GFX7-NEXT: ds_write_b96 v3, v[0:2] ; GFX7-NEXT: s_endpgm ; @@ -51,12 +51,12 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: s_lshr_b32 s0, s12, 8 ; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: s_lshr_b32 s1, s12, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -92,43 +92,43 @@ ; ; GFX7-LABEL: store_lds_v3i32_align1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0xd +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: s_lshr_b32 s3, s0, 8 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: s_lshr_b32 s5, s0, 16 +; GFX7-NEXT: s_lshr_b32 s1, s12, 8 +; GFX7-NEXT: v_mov_b32_e32 v0, s12 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_lshr_b32 s2, s12, 16 ; GFX7-NEXT: ds_write_b8 v1, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s3 -; GFX7-NEXT: s_lshr_b32 s6, s0, 24 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: s_lshr_b32 s3, s12, 24 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:1 -; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:2 -; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v0, s3 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:3 -; GFX7-NEXT: s_lshr_b32 s0, s1, 8 -; GFX7-NEXT: v_mov_b32_e32 v0, s1 -; GFX7-NEXT: s_lshr_b32 s3, s1, 16 +; GFX7-NEXT: s_lshr_b32 s0, s13, 8 +; GFX7-NEXT: v_mov_b32_e32 v0, s13 +; GFX7-NEXT: s_lshr_b32 s1, s13, 16 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:4 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: s_lshr_b32 s4, s1, 24 +; GFX7-NEXT: s_lshr_b32 s2, s13, 24 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:5 -; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:6 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: ds_write_b8 v1, v0 offset:7 -; GFX7-NEXT: s_lshr_b32 s0, s2, 8 ; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: s_lshr_b32 s1, s2, 16 +; GFX7-NEXT: ds_write_b8 v1, v0 offset:7 +; GFX7-NEXT: s_lshr_b32 s0, s14, 8 +; GFX7-NEXT: v_mov_b32_e32 v0, s14 +; GFX7-NEXT: s_lshr_b32 s1, s14, 16 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:8 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: s_lshr_b32 s3, s2, 24 +; GFX7-NEXT: s_lshr_b32 s2, s14, 24 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:9 ; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:10 -; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:11 ; GFX7-NEXT: s_endpgm ; @@ -180,12 +180,12 @@ define amdgpu_kernel void @store_lds_v3i32_align2(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: s_lshr_b32 s0, s12, 16 ; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: ds_write_b16 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:2 @@ -203,23 +203,23 @@ ; ; GFX7-LABEL: store_lds_v3i32_align2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0xd +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: s_lshr_b32 s3, s0, 16 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_lshr_b32 s1, s12, 16 +; GFX7-NEXT: v_mov_b32_e32 v0, s12 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-NEXT: ds_write_b16 v1, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s3 -; GFX7-NEXT: ds_write_b16 v1, v0 offset:2 -; GFX7-NEXT: s_lshr_b32 s0, s1, 16 ; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: ds_write_b16 v1, v0 offset:2 +; GFX7-NEXT: s_lshr_b32 s0, s13, 16 +; GFX7-NEXT: v_mov_b32_e32 v0, s13 ; GFX7-NEXT: ds_write_b16 v1, v0 offset:4 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: ds_write_b16 v1, v0 offset:6 -; GFX7-NEXT: s_lshr_b32 s0, s2, 16 -; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: s_lshr_b32 s0, s14, 16 +; GFX7-NEXT: v_mov_b32_e32 v0, s14 ; GFX7-NEXT: ds_write_b16 v1, v0 offset:8 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: ds_write_b16 v1, v0 offset:10 @@ -255,12 +255,12 @@ define amdgpu_kernel void @store_lds_v3i32_align4(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s12 ; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s14 ; GFX9-NEXT: ds_write2_b32 v2, v0, v1 offset1:1 ; GFX9-NEXT: ds_write_b32 v2, v3 offset:8 @@ -268,14 +268,14 @@ ; ; GFX7-LABEL: store_lds_v3i32_align4: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0xd +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v3, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s12 +; GFX7-NEXT: v_mov_b32_e32 v1, s13 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s14 ; GFX7-NEXT: ds_write2_b32 v2, v0, v1 offset1:1 ; GFX7-NEXT: ds_write_b32 v2, v3 offset:8 ; GFX7-NEXT: s_endpgm @@ -300,12 +300,12 @@ define amdgpu_kernel void @store_lds_v3i32_align8(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s12 ; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s14 ; GFX9-NEXT: ds_write_b64 v2, v[0:1] ; GFX9-NEXT: ds_write_b32 v2, v3 offset:8 @@ -313,14 +313,14 @@ ; ; GFX7-LABEL: store_lds_v3i32_align8: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0xd +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v3, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s12 +; GFX7-NEXT: v_mov_b32_e32 v1, s13 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s14 ; GFX7-NEXT: ds_write_b64 v2, v[0:1] ; GFX7-NEXT: ds_write_b32 v2, v3 offset:8 ; GFX7-NEXT: s_endpgm @@ -345,26 +345,26 @@ define amdgpu_kernel void @store_lds_v3i32_align16(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s12 ; GFX9-NEXT: v_mov_b32_e32 v1, s13 ; GFX9-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: ds_write_b96 v3, v[0:2] ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v3i32_align16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0xd +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, s4 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s12 +; GFX7-NEXT: v_mov_b32_e32 v1, s13 +; GFX7-NEXT: v_mov_b32_e32 v2, s14 +; GFX7-NEXT: v_mov_b32_e32 v3, s0 ; GFX7-NEXT: ds_write_b96 v3, v[0:2] ; GFX7-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -556,8 +556,8 @@ ; GFX8-LABEL: udivrem_v2i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x18 -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x10 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s3 @@ -574,13 +574,13 @@ ; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX8-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; GFX8-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, s8, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 -; GFX8-NEXT: v_mul_hi_u32 v1, s5, v1 +; GFX8-NEXT: v_mul_hi_u32 v1, s9, v1 ; GFX8-NEXT: v_mul_lo_u32 v2, v0, s2 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0 ; GFX8-NEXT: v_mul_lo_u32 v4, v1, s3 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s4, v2 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s8, v2 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s2, v2 @@ -590,7 +590,7 @@ ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s5, v4 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s9, v4 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc @@ -601,11 +601,11 @@ ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX8-NEXT: v_mov_b32_e32 v4, s8 -; GFX8-NEXT: v_mov_b32_e32 v5, s9 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s10 -; GFX8-NEXT: v_mov_b32_e32 v1, s11 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; @@ -727,9 +727,9 @@ ; GFX8-LABEL: udivrem_v4i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x20 +; GFX8-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x10 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x4f7ffffe -; GFX8-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x0 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s9 @@ -747,13 +747,13 @@ ; GFX8-NEXT: v_mul_hi_u32 v3, v0, v3 ; GFX8-NEXT: v_mul_hi_u32 v4, v1, v4 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 -; GFX8-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, s12, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v4 -; GFX8-NEXT: v_mul_hi_u32 v1, s5, v1 +; GFX8-NEXT: v_mul_hi_u32 v1, s13, v1 ; GFX8-NEXT: v_mul_lo_u32 v3, v0, s8 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v0 ; GFX8-NEXT: v_mul_lo_u32 v5, v1, s9 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s12, v3 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s8, v3 @@ -763,7 +763,7 @@ ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s8, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s5, v5 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s13, v5 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v5, v6 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 1, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 @@ -783,7 +783,7 @@ ; GFX8-NEXT: v_mul_f32_e32 v2, v6, v2 ; GFX8-NEXT: v_add_u32_e64 v5, s[0:1], v5, v7 ; GFX8-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX8-NEXT: v_mul_hi_u32 v7, s6, v5 +; GFX8-NEXT: v_mul_hi_u32 v7, s14, v5 ; GFX8-NEXT: v_subrev_u32_e64 v5, s[0:1], s9, v3 ; GFX8-NEXT: s_sub_i32 s0, 0, s11 ; GFX8-NEXT: v_mul_lo_u32 v6, s0, v2 @@ -791,20 +791,20 @@ ; GFX8-NEXT: v_mul_lo_u32 v3, v7, s10 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 1, v7 ; GFX8-NEXT: v_mul_hi_u32 v6, v2, v6 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s6, v3 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s14, v3 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc ; GFX8-NEXT: v_subrev_u32_e64 v8, s[0:1], s10, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; GFX8-NEXT: v_mul_hi_u32 v8, s7, v2 +; GFX8-NEXT: v_mul_hi_u32 v8, s15, v2 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v7 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc ; GFX8-NEXT: v_mul_lo_u32 v7, v8, s11 ; GFX8-NEXT: v_subrev_u32_e64 v6, s[0:1], s10, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v3, v6, vcc -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s7, v7 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s15, v7 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 1, v8 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc @@ -815,12 +815,12 @@ ; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX8-NEXT: v_subrev_u32_e64 v7, s[0:1], s11, v8 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc -; GFX8-NEXT: v_mov_b32_e32 v8, s12 -; GFX8-NEXT: v_mov_b32_e32 v9, s13 +; GFX8-NEXT: v_mov_b32_e32 v9, s5 +; GFX8-NEXT: v_mov_b32_e32 v8, s4 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s14 -; GFX8-NEXT: v_mov_b32_e32 v1, s15 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX8-NEXT: s_endpgm ; @@ -1884,34 +1884,34 @@ define amdgpu_kernel void @udiv_i8(i8 addrspace(1)* %out0, i8 addrspace(1)* %out1, i8 %x, i8 %y) { ; GFX8-LABEL: udiv_i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x10 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s6, s0, 0x80008 -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s6 +; GFX8-NEXT: s_bfe_u32 s7, s6, 0x80008 +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: s_sub_i32 s1, 0, s6 -; GFX8-NEXT: s_and_b32 s7, s0, 0xff +; GFX8-NEXT: s_sub_i32 s0, 0, s7 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: v_mul_lo_u32 v1, s1, v0 +; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_and_b32 s4, s6, 0xff ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; GFX8-NEXT: v_mul_hi_u32 v2, s7, v0 +; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mul_lo_u32 v3, v2, s6 +; GFX8-NEXT: v_mul_lo_u32 v3, v2, s7 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s7, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s6, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s6, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc @@ -1996,8 +1996,8 @@ define amdgpu_kernel void @udivrem_v2i8(<2 x i8> addrspace(1)* %out0, <2 x i8> addrspace(1)* %out1, <2 x i8> %x, <2 x i8> %y) { ; GFX8-LABEL: udivrem_v2i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x10 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_u32 s2, s0, 0x80010 ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 @@ -2015,12 +2015,12 @@ ; GFX8-NEXT: v_mul_lo_u32 v3, s1, v1 ; GFX8-NEXT: s_and_b32 s1, s0, 0xff ; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX8-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x80008 ; GFX8-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_mul_hi_u32 v0, s1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 -; GFX8-NEXT: v_mul_hi_u32 v1, s4, v1 +; GFX8-NEXT: v_mul_hi_u32 v1, s8, v1 ; GFX8-NEXT: v_mul_lo_u32 v2, v0, s2 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s1, v2 @@ -2034,7 +2034,7 @@ ; GFX8-NEXT: v_mul_lo_u32 v3, v1, s3 ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s8, v3 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc @@ -2049,14 +2049,14 @@ ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_short v[0:1], v4 ; GFX8-NEXT: v_and_b32_e32 v0, s0, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX8-NEXT: v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_mov_b32_e32 v0, s10 -; GFX8-NEXT: v_mov_b32_e32 v1, s11 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -2195,34 +2195,34 @@ define amdgpu_kernel void @udiv_i16(i16 addrspace(1)* %out0, i16 addrspace(1)* %out1, i16 %x, i16 %y) { ; GFX8-LABEL: udiv_i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x10 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s6, s0, 16 -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX8-NEXT: s_sub_i32 s1, 0, s6 -; GFX8-NEXT: s_and_b32 s7, s0, 0xffff +; GFX8-NEXT: s_lshr_b32 s7, s6, 16 +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX8-NEXT: s_sub_i32 s0, 0, s7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: v_mul_lo_u32 v1, s1, v0 +; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_and_b32 s4, s6, 0xffff ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; GFX8-NEXT: v_mul_hi_u32 v2, s7, v0 +; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mul_lo_u32 v3, v2, s6 +; GFX8-NEXT: v_mul_lo_u32 v3, v2, s7 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s7, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s6, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s6, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc @@ -2375,37 +2375,38 @@ ; ; GFX9-LABEL: udivrem_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s1, s[4:5], 0x14 -; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x14 +; GFX9-NEXT: s_mov_b32 s1, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s7, s1, s0 +; GFX9-NEXT: s_and_b32 s7, s0, s1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_lshr_b32 s6, s1, 16 +; GFX9-NEXT: s_lshr_b32 s6, s0, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 -; GFX9-NEXT: s_sub_i32 s1, 0, s7 +; GFX9-NEXT: s_sub_i32 s2, 0, s7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s2, 0, s6 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX9-NEXT: s_sub_i32 s3, 0, s6 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_mul_lo_u32 v2, s1, v0 -; GFX9-NEXT: s_load_dword s1, s[4:5], 0x10 -; GFX9-NEXT: v_mul_lo_u32 v3, s2, v1 -; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s9, s1, s0 +; GFX9-NEXT: s_and_b32 s9, s0, s1 +; GFX9-NEXT: v_mul_lo_u32 v2, s2, v0 +; GFX9-NEXT: s_lshr_b32 s8, s0, 16 +; GFX9-NEXT: v_mul_lo_u32 v3, s3, v1 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: s_lshr_b32 s8, s1, 16 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 ; GFX9-NEXT: v_mul_hi_u32 v1, s8, v1 ; GFX9-NEXT: v_mul_lo_u32 v2, v0, s7 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mul_lo_u32 v3, v1, s6 +; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 ; GFX9-NEXT: v_sub_u32_e32 v2, s9, v2 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc @@ -2416,7 +2417,6 @@ ; GFX9-NEXT: v_sub_u32_e32 v3, s8, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v2 -; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX9-NEXT: v_subrev_u32_e32 v4, s6, v3 @@ -2512,34 +2512,34 @@ define amdgpu_kernel void @udivrem_i3(i3 addrspace(1)* %out0, i3 addrspace(1)* %out1, i3 %x, i3 %y) { ; GFX8-LABEL: udivrem_i3: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x10 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s6, s0, 0x30008 -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s6 +; GFX8-NEXT: s_bfe_u32 s7, s6, 0x30008 +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: s_sub_i32 s1, 0, s6 -; GFX8-NEXT: s_and_b32 s7, s0, 7 +; GFX8-NEXT: s_sub_i32 s0, 0, s7 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: v_mul_lo_u32 v1, s1, v0 +; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_and_b32 s4, s6, 7 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; GFX8-NEXT: v_mul_hi_u32 v2, s7, v0 +; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mul_lo_u32 v3, v2, s6 +; GFX8-NEXT: v_mul_lo_u32 v3, v2, s7 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s7, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s6, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s6, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: flat_store_byte v[0:1], v2 @@ -2630,27 +2630,27 @@ define amdgpu_kernel void @udivrem_i27(i27 addrspace(1)* %out0, i27 addrspace(1)* %out1, i27 %x, i27 %y) { ; GFX8-LABEL: udivrem_i27: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 -; GFX8-NEXT: s_mov_b32 s6, 0x7ffffff +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX8-NEXT: s_mov_b32 s8, 0x7ffffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s7, s1, s6 +; GFX8-NEXT: s_and_b32 s7, s7, s8 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX8-NEXT: s_sub_i32 s1, 0, s7 -; GFX8-NEXT: s_and_b32 s8, s0, s6 +; GFX8-NEXT: s_sub_i32 s0, 0, s7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: v_mul_lo_u32 v1, s1, v0 +; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_and_b32 s4, s6, s8 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; GFX8-NEXT: v_mul_hi_u32 v2, s8, v0 +; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mul_lo_u32 v3, v2, s7 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s8, v3 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 @@ -2659,11 +2659,11 @@ ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 -; GFX8-NEXT: v_and_b32_e32 v2, s6, v2 +; GFX8-NEXT: v_and_b32_e32 v2, s8, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_and_b32_e32 v2, s6, v3 +; GFX8-NEXT: v_and_b32_e32 v2, s8, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll --- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll +++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll @@ -417,14 +417,14 @@ ; FIXEDABI-SDAG-LABEL: addrspacecast_requires_queue_ptr: ; FIXEDABI-SDAG: ; %bb.0: ; FIXEDABI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FIXEDABI-SDAG-NEXT: s_load_dword s4, s[6:7], 0x40 -; FIXEDABI-SDAG-NEXT: s_load_dword s5, s[6:7], 0x44 +; FIXEDABI-SDAG-NEXT: s_load_dword s4, s[6:7], 0x44 +; FIXEDABI-SDAG-NEXT: s_load_dword s5, s[6:7], 0x40 ; FIXEDABI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0 ; FIXEDABI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; FIXEDABI-SDAG-NEXT: v_mov_b32_e32 v2, s5 +; FIXEDABI-SDAG-NEXT: v_mov_b32_e32 v2, s4 ; FIXEDABI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc ; FIXEDABI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc -; FIXEDABI-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; FIXEDABI-SDAG-NEXT: v_mov_b32_e32 v0, s5 ; FIXEDABI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, -1, v1 ; FIXEDABI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v0, vcc ; FIXEDABI-SDAG-NEXT: v_mov_b32_e32 v0, 1 diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll @@ -25,7 +25,7 @@ ; GFX9: s_load_dword [[VAL0:s[0-9]+]] ; GFX9: s_load_dword [[VAL1:s[0-9]+]] ; GFX9: v_mov_b32_e32 [[VVAL1:v[0-9]+]] -; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[VAL0]], [[VVAL1]] +; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[VAL1]], [[VVAL1]] ; VI: s_add_i32 ; VI: s_add_i32 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -1,4 +1,3 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update ; RUN: opt -S -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-codegenprepare -amdgpu-bypass-slow-div=0 %s | FileCheck %s ; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX6 %s @@ -764,20 +763,20 @@ ; ; GFX6-LABEL: sdiv_i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_ashr_i32 s1, s0, 16 -; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s1 -; GFX6-NEXT: s_sext_i32_i16 s0, s0 -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s0 -; GFX6-NEXT: s_xor_b32 s0, s0, s1 +; GFX6-NEXT: s_ashr_i32 s5, s4, 16 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 +; GFX6-NEXT: s_sext_i32_i16 s4, s4 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GFX6-NEXT: s_xor_b32 s4, s4, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX6-NEXT: s_ashr_i32 s0, s0, 30 -; GFX6-NEXT: s_or_b32 s0, s0, 1 -; GFX6-NEXT: v_mov_b32_e32 v3, s0 +; GFX6-NEXT: s_ashr_i32 s4, s4, 30 +; GFX6-NEXT: s_or_b32 s4, s4, 1 +; GFX6-NEXT: v_mov_b32_e32 v3, s4 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 @@ -785,13 +784,13 @@ ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s0, s4, 16 @@ -815,8 +814,8 @@ ; ; GFX90A-LABEL: sdiv_i16: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_ashr_i32 s0, s4, 16 @@ -928,8 +927,8 @@ ; ; GFX90A-LABEL: srem_i16: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_ashr_i32 s5, s4, 16 @@ -981,21 +980,21 @@ ; ; GFX6-LABEL: udiv_i8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_ubyte1_e32 v0, s0 +; GFX6-NEXT: v_cvt_f32_ubyte1_e32 v0, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s4 ; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1 ; GFX6-NEXT: v_mad_f32 v1, -v1, v0, v2 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc -; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_i8: @@ -1108,8 +1107,8 @@ ; ; GFX90A-LABEL: urem_i8: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cvt_f32_ubyte1_e32 v0, s4 @@ -1159,20 +1158,20 @@ ; ; GFX6-LABEL: sdiv_i8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_i32 s1, s0, 0x80008 -; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s1 -; GFX6-NEXT: s_sext_i32_i8 s0, s0 -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s0 -; GFX6-NEXT: s_xor_b32 s0, s0, s1 +; GFX6-NEXT: s_bfe_i32 s5, s4, 0x80008 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 +; GFX6-NEXT: s_sext_i32_i8 s4, s4 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GFX6-NEXT: s_xor_b32 s4, s4, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX6-NEXT: s_ashr_i32 s0, s0, 30 -; GFX6-NEXT: s_or_b32 s0, s0, 1 -; GFX6-NEXT: v_mov_b32_e32 v3, s0 +; GFX6-NEXT: s_ashr_i32 s4, s4, 30 +; GFX6-NEXT: s_or_b32 s4, s4, 1 +; GFX6-NEXT: v_mov_b32_e32 v3, s4 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 @@ -1180,13 +1179,13 @@ ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_i32 s0, s4, 0x80008 @@ -1210,8 +1209,8 @@ ; ; GFX90A-LABEL: sdiv_i8: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_bfe_i32 s0, s4, 0x80008 @@ -1267,66 +1266,65 @@ ; ; GFX6-LABEL: srem_i8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_i32 s1, s0, 0x80008 -; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s1 -; GFX6-NEXT: s_sext_i32_i8 s3, s0 -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s3 -; GFX6-NEXT: s_xor_b32 s1, s3, s1 +; GFX6-NEXT: s_bfe_i32 s2, s4, 0x80008 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX6-NEXT: s_sext_i32_i8 s5, s4 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 +; GFX6-NEXT: s_xor_b32 s2, s5, s2 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX6-NEXT: s_ashr_i32 s1, s1, 30 -; GFX6-NEXT: s_or_b32 s1, s1, 1 -; GFX6-NEXT: v_mov_b32_e32 v3, s1 +; GFX6-NEXT: s_ashr_i32 s2, s2, 30 +; GFX6-NEXT: s_or_b32 s2, s2, 1 +; GFX6-NEXT: v_mov_b32_e32 v3, s2 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GFX6-NEXT: s_lshr_b32 s2, s0, 8 +; GFX6-NEXT: s_lshr_b32 s3, s4, 8 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 -; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s2, s4, 0x80008 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX9-NEXT: s_sext_i32_i8 s3, s4 -; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s3 -; GFX9-NEXT: s_xor_b32 s2, s3, s2 +; GFX9-NEXT: s_bfe_i32 s0, s4, 0x80008 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9-NEXT: s_sext_i32_i8 s1, s4 +; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s1 +; GFX9-NEXT: s_xor_b32 s0, s1, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: s_lshr_b32 s5, s4, 8 -; GFX9-NEXT: s_or_b32 s6, s2, 1 +; GFX9-NEXT: s_or_b32 s6, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 ; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s6, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v2 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s6, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s0, v2 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_byte v1, v0, s[0:1] +; GFX9-NEXT: global_store_byte v1, v0, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX90A-LABEL: srem_i8: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_bfe_i32 s0, s4, 0x80008 @@ -3234,22 +3232,22 @@ ; ; GFX6-LABEL: udiv_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s8, 0xffff -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s9, s2, s8 +; GFX6-NEXT: s_and_b32 s9, s6, s8 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GFX6-NEXT: s_lshr_b32 s9, s0, 16 -; GFX6-NEXT: s_and_b32 s0, s0, s8 -; GFX6-NEXT: s_lshr_b32 s2, s2, 16 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s0 +; GFX6-NEXT: s_lshr_b32 s9, s4, 16 +; GFX6-NEXT: s_and_b32 s4, s4, s8 +; GFX6-NEXT: s_lshr_b32 s6, s6, 16 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s6 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s9 -; GFX6-NEXT: s_and_b32 s2, s3, s8 +; GFX6-NEXT: s_and_b32 s6, s7, s8 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 @@ -3260,18 +3258,18 @@ ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc ; GFX6-NEXT: v_mad_f32 v2, -v1, v3, v4 -; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s2 -; GFX6-NEXT: s_lshr_b32 s0, s1, 16 -; GFX6-NEXT: s_lshr_b32 s10, s3, 16 -; GFX6-NEXT: s_and_b32 s1, s1, s8 +; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s6 +; GFX6-NEXT: s_lshr_b32 s4, s5, 16 +; GFX6-NEXT: s_lshr_b32 s10, s7, 16 +; GFX6-NEXT: s_and_b32 s5, s5, s8 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s1 +; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 ; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s10 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v1, vcc ; GFX6-NEXT: v_mul_f32_e32 v1, v5, v6 -; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s0 +; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v3 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_mad_f32 v5, -v1, v4, v5 @@ -3290,26 +3288,26 @@ ; GFX6-NEXT: v_and_b32_e32 v0, s8, v0 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_mov_b32 s8, 0xffff +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s6, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX9-NEXT: s_lshr_b32 s0, s4, 16 -; GFX9-NEXT: s_and_b32 s4, s4, s8 +; GFX9-NEXT: s_and_b32 s8, s6, s0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX9-NEXT: s_lshr_b32 s1, s4, 16 +; GFX9-NEXT: s_and_b32 s4, s4, s0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s4 ; GFX9-NEXT: s_lshr_b32 s4, s6, 16 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 -; GFX9-NEXT: s_and_b32 s0, s7, s8 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s1 +; GFX9-NEXT: s_and_b32 s1, s7, s0 ; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 @@ -3320,18 +3318,18 @@ ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc ; GFX9-NEXT: v_mad_f32 v3, -v1, v4, v5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s1 ; GFX9-NEXT: s_lshr_b32 s6, s7, 16 -; GFX9-NEXT: s_and_b32 s0, s5, s8 +; GFX9-NEXT: s_and_b32 s0, s5, s0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s6 -; GFX9-NEXT: s_lshr_b32 s1, s5, 16 +; GFX9-NEXT: s_lshr_b32 s8, s5, 16 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX9-NEXT: v_mul_f32_e32 v1, v6, v7 -; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1 +; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v4 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_mad_f32 v6, -v1, v5, v6 @@ -3354,21 +3352,21 @@ ; ; GFX90A-LABEL: udiv_v4i16: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX90A-NEXT: s_mov_b32 s8, 0xffff +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_mov_b32 s0, 0xffff ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_and_b32 s1, s6, s8 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX90A-NEXT: s_lshr_b32 s0, s4, 16 -; GFX90A-NEXT: s_and_b32 s4, s4, s8 +; GFX90A-NEXT: s_and_b32 s8, s6, s0 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX90A-NEXT: s_lshr_b32 s1, s4, 16 +; GFX90A-NEXT: s_and_b32 s4, s4, s0 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s4 ; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s0 -; GFX90A-NEXT: s_and_b32 s0, s7, s8 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s1 +; GFX90A-NEXT: s_and_b32 s1, s7, s0 ; GFX90A-NEXT: v_mul_f32_e32 v3, v1, v3 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v6, v4 ; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 @@ -3379,18 +3377,18 @@ ; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc ; GFX90A-NEXT: v_mad_f32 v3, -v1, v4, v5 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s0 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s1 ; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 -; GFX90A-NEXT: s_and_b32 s0, s5, s8 +; GFX90A-NEXT: s_and_b32 s0, s5, s0 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v6, s0 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v7, v5 ; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s6 -; GFX90A-NEXT: s_lshr_b32 s1, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s8, s5, 16 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: v_mul_f32_e32 v1, v6, v7 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v7, s1 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v7, s8 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v8, v4 ; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 ; GFX90A-NEXT: v_mad_f32 v6, -v1, v5, v6 @@ -3510,20 +3508,20 @@ ; ; GFX6-LABEL: urem_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s8, 0xffff -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s9, s2, s8 +; GFX6-NEXT: s_and_b32 s9, s6, s8 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GFX6-NEXT: s_and_b32 s10, s0, s8 -; GFX6-NEXT: s_lshr_b32 s11, s2, 16 +; GFX6-NEXT: s_and_b32 s10, s4, s8 +; GFX6-NEXT: s_lshr_b32 s11, s6, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s10 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s11 -; GFX6-NEXT: s_lshr_b32 s9, s0, 16 +; GFX6-NEXT: s_lshr_b32 s9, s4, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s9 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 @@ -3536,24 +3534,24 @@ ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v1 ; GFX6-NEXT: v_mad_f32 v1, -v1, v3, v4 -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v3 -; GFX6-NEXT: s_and_b32 s2, s3, s8 +; GFX6-NEXT: s_and_b32 s6, s7, s8 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s2 -; GFX6-NEXT: s_and_b32 s2, s1, s8 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 +; GFX6-NEXT: s_and_b32 s6, s5, s8 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s11 -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GFX6-NEXT: s_lshr_b32 s12, s3, 16 +; GFX6-NEXT: s_lshr_b32 s12, s7, 16 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s9, v1 ; GFX6-NEXT: v_mul_f32_e32 v1, v3, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s12 -; GFX6-NEXT: s_lshr_b32 s10, s1, 16 +; GFX6-NEXT: s_lshr_b32 s10, s5, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s10 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v4 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 @@ -3564,35 +3562,35 @@ ; GFX6-NEXT: v_mad_f32 v2, -v2, v4, v6 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s3 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7 ; GFX6-NEXT: v_mul_lo_u32 v2, v2, s12 ; GFX6-NEXT: v_and_b32_e32 v0, s8, v0 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s10, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_and_b32_e32 v1, s8, v1 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_mov_b32 s8, 0xffff +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s6, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX9-NEXT: s_and_b32 s9, s4, s8 +; GFX9-NEXT: s_and_b32 s8, s6, s0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX9-NEXT: s_and_b32 s9, s4, s0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 ; GFX9-NEXT: s_lshr_b32 s9, s6, 16 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s9 -; GFX9-NEXT: s_lshr_b32 s0, s4, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 +; GFX9-NEXT: s_lshr_b32 s1, s4, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s1 ; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 @@ -3604,18 +3602,18 @@ ; GFX9-NEXT: v_mul_f32_e32 v1, v5, v6 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s6 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 -; GFX9-NEXT: s_and_b32 s6, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s7, s0 ; GFX9-NEXT: v_mad_f32 v3, -v1, v4, v5 ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s6 -; GFX9-NEXT: s_and_b32 s6, s5, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s6 +; GFX9-NEXT: s_and_b32 s0, s5, s0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5 ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s10 -; GFX9-NEXT: s_lshr_b32 s1, s5, 16 +; GFX9-NEXT: s_lshr_b32 s8, s5, 16 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v3, v6, v7 -; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1 +; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v4 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc @@ -3633,9 +3631,9 @@ ; GFX9-NEXT: v_mul_lo_u32 v3, v3, s7 ; GFX9-NEXT: v_mul_lo_u32 v4, v4, s10 ; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 -; GFX9-NEXT: v_sub_u32_e32 v5, s0, v1 +; GFX9-NEXT: v_sub_u32_e32 v5, s1, v1 ; GFX9-NEXT: v_sub_u32_e32 v1, s5, v3 -; GFX9-NEXT: v_sub_u32_e32 v3, s1, v4 +; GFX9-NEXT: v_sub_u32_e32 v3, s8, v4 ; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: v_and_b32_e32 v1, v4, v1 ; GFX9-NEXT: v_and_b32_e32 v0, v4, v0 @@ -3646,20 +3644,20 @@ ; ; GFX90A-LABEL: urem_v4i16: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX90A-NEXT: s_mov_b32 s8, 0xffff +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_mov_b32 s0, 0xffff ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_and_b32 s1, s6, s8 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX90A-NEXT: s_and_b32 s9, s4, s8 +; GFX90A-NEXT: s_and_b32 s8, s6, s0 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX90A-NEXT: s_and_b32 s9, s4, s0 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s9 ; GFX90A-NEXT: s_lshr_b32 s9, s6, 16 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s9 -; GFX90A-NEXT: s_lshr_b32 s0, s4, 16 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s0 +; GFX90A-NEXT: s_lshr_b32 s1, s4, 16 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s1 ; GFX90A-NEXT: v_mul_f32_e32 v3, v1, v3 ; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 ; GFX90A-NEXT: v_mad_f32 v1, -v3, v0, v1 @@ -3672,21 +3670,21 @@ ; GFX90A-NEXT: v_mul_f32_e32 v1, v5, v6 ; GFX90A-NEXT: v_sub_u32_e32 v0, s4, v0 ; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 -; GFX90A-NEXT: s_and_b32 s4, s7, s8 +; GFX90A-NEXT: s_and_b32 s4, s7, s0 ; GFX90A-NEXT: v_mad_f32 v3, -v1, v4, v5 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s4 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX90A-NEXT: s_and_b32 s4, s5, s8 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v6, s4 +; GFX90A-NEXT: s_and_b32 s0, s5, s0 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v6, s0 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v7, v5 ; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s10 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: s_lshr_b32 s1, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s8, s5, 16 ; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s9 -; GFX90A-NEXT: v_sub_u32_e32 v3, s0, v1 +; GFX90A-NEXT: v_sub_u32_e32 v3, s1, v1 ; GFX90A-NEXT: v_mul_f32_e32 v1, v6, v7 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v7, s1 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v7, s8 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v8, v4 ; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 ; GFX90A-NEXT: v_mad_f32 v6, -v1, v5, v6 @@ -3703,7 +3701,7 @@ ; GFX90A-NEXT: v_sub_u32_e32 v1, s5, v1 ; GFX90A-NEXT: v_mul_lo_u32 v4, v4, s10 ; GFX90A-NEXT: v_mov_b32_e32 v5, 0xffff -; GFX90A-NEXT: v_sub_u32_e32 v4, s1, v4 +; GFX90A-NEXT: v_sub_u32_e32 v4, s8, v4 ; GFX90A-NEXT: v_and_b32_e32 v1, v5, v1 ; GFX90A-NEXT: v_and_b32_e32 v0, v5, v0 ; GFX90A-NEXT: v_lshl_or_b32 v1, v4, 16, v1 @@ -3818,18 +3816,18 @@ ; ; GFX6-LABEL: sdiv_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_sext_i32_i16 s8, s2 +; GFX6-NEXT: s_sext_i32_i16 s8, s6 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s8 -; GFX6-NEXT: s_sext_i32_i16 s9, s0 +; GFX6-NEXT: s_sext_i32_i16 s9, s4 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s9 ; GFX6-NEXT: s_xor_b32 s8, s9, s8 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX6-NEXT: s_ashr_i32 s2, s2, 16 +; GFX6-NEXT: s_ashr_i32 s6, s6, 16 ; GFX6-NEXT: s_ashr_i32 s8, s8, 30 ; GFX6-NEXT: s_or_b32 s8, s8, 1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -3837,70 +3835,70 @@ ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s2 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v3, s8 ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GFX6-NEXT: s_ashr_i32 s0, s0, 16 +; GFX6-NEXT: s_ashr_i32 s4, s4, 16 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s0 +; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 -; GFX6-NEXT: s_xor_b32 s0, s0, s2 -; GFX6-NEXT: s_ashr_i32 s0, s0, 30 -; GFX6-NEXT: s_or_b32 s0, s0, 1 +; GFX6-NEXT: s_xor_b32 s4, s4, s6 +; GFX6-NEXT: s_ashr_i32 s4, s4, 30 +; GFX6-NEXT: s_or_b32 s4, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3 ; GFX6-NEXT: v_mad_f32 v2, -v3, v1, v2 -; GFX6-NEXT: v_mov_b32_e32 v4, s0 -; GFX6-NEXT: s_sext_i32_i16 s0, s3 +; GFX6-NEXT: v_mov_b32_e32 v4, s4 +; GFX6-NEXT: s_sext_i32_i16 s4, s7 ; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| -; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s0 +; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 ; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc -; GFX6-NEXT: s_sext_i32_i16 s2, s1 +; GFX6-NEXT: s_sext_i32_i16 s6, s5 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v1, v3 -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s2 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GFX6-NEXT: s_xor_b32 s0, s2, s0 -; GFX6-NEXT: s_ashr_i32 s0, s0, 30 -; GFX6-NEXT: s_or_b32 s0, s0, 1 +; GFX6-NEXT: s_xor_b32 s4, s6, s4 +; GFX6-NEXT: s_ashr_i32 s4, s4, 30 +; GFX6-NEXT: s_or_b32 s4, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v4, v1, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v1, -v4, v2, v1 -; GFX6-NEXT: v_mov_b32_e32 v5, s0 -; GFX6-NEXT: s_ashr_i32 s0, s3, 16 +; GFX6-NEXT: v_mov_b32_e32 v5, s4 +; GFX6-NEXT: s_ashr_i32 s4, s7, 16 ; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v2| -; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s0 +; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 ; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc -; GFX6-NEXT: s_ashr_i32 s1, s1, 16 +; GFX6-NEXT: s_ashr_i32 s5, s5, 16 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 -; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s1 +; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 -; GFX6-NEXT: s_xor_b32 s0, s1, s0 -; GFX6-NEXT: s_ashr_i32 s0, s0, 30 -; GFX6-NEXT: s_or_b32 s0, s0, 1 +; GFX6-NEXT: s_xor_b32 s4, s5, s4 +; GFX6-NEXT: s_ashr_i32 s4, s4, 30 +; GFX6-NEXT: s_or_b32 s4, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v5, v5 ; GFX6-NEXT: v_mad_f32 v4, -v5, v2, v4 ; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, s0 +; GFX6-NEXT: v_mov_b32_e32 v6, s4 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2| ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GFX6-NEXT: s_mov_b32 s0, 0xffff +; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i16 s0, s6 @@ -3977,8 +3975,8 @@ ; ; GFX90A-LABEL: sdiv_v4i16: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_sext_i32_i16 s0, s6 @@ -4168,14 +4166,14 @@ ; ; GFX6-LABEL: srem_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_sext_i32_i16 s8, s2 +; GFX6-NEXT: s_sext_i32_i16 s8, s6 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s8 -; GFX6-NEXT: s_sext_i32_i16 s9, s0 +; GFX6-NEXT: s_sext_i32_i16 s9, s4 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s9 ; GFX6-NEXT: s_xor_b32 s8, s9, s8 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 @@ -4189,14 +4187,14 @@ ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 -; GFX6-NEXT: s_ashr_i32 s2, s2, 16 -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s2 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 -; GFX6-NEXT: s_ashr_i32 s0, s0, 16 -; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s0 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 +; GFX6-NEXT: s_ashr_i32 s6, s6, 16 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s6 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GFX6-NEXT: s_ashr_i32 s4, s4, 16 +; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 -; GFX6-NEXT: s_xor_b32 s8, s0, s2 +; GFX6-NEXT: s_xor_b32 s8, s4, s6 ; GFX6-NEXT: s_ashr_i32 s8, s8, 30 ; GFX6-NEXT: s_or_b32 s8, s8, 1 ; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 @@ -4207,58 +4205,58 @@ ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| ; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s2 -; GFX6-NEXT: s_sext_i32_i16 s2, s3 -; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s2 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s0, v1 -; GFX6-NEXT: s_sext_i32_i16 s0, s1 -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s0 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s6 +; GFX6-NEXT: s_sext_i32_i16 s6, s7 +; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s6 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v1 +; GFX6-NEXT: s_sext_i32_i16 s4, s5 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GFX6-NEXT: s_xor_b32 s0, s0, s2 -; GFX6-NEXT: s_ashr_i32 s0, s0, 30 -; GFX6-NEXT: s_or_b32 s0, s0, 1 +; GFX6-NEXT: s_xor_b32 s4, s4, s6 +; GFX6-NEXT: s_ashr_i32 s4, s4, 30 +; GFX6-NEXT: s_or_b32 s4, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v4, v1, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v1, -v4, v2, v1 -; GFX6-NEXT: v_mov_b32_e32 v5, s0 -; GFX6-NEXT: s_ashr_i32 s0, s3, 16 +; GFX6-NEXT: v_mov_b32_e32 v5, s4 +; GFX6-NEXT: s_ashr_i32 s4, s7, 16 ; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v2| -; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s0 +; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 ; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc -; GFX6-NEXT: s_ashr_i32 s2, s1, 16 +; GFX6-NEXT: s_ashr_i32 s6, s5, 16 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 -; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s2 +; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s3 -; GFX6-NEXT: s_xor_b32 s3, s2, s0 -; GFX6-NEXT: s_ashr_i32 s3, s3, 30 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7 +; GFX6-NEXT: s_xor_b32 s7, s6, s4 +; GFX6-NEXT: s_ashr_i32 s7, s7, 30 ; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v5, v5 ; GFX6-NEXT: v_mad_f32 v4, -v5, v2, v4 ; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX6-NEXT: s_or_b32 s3, s3, 1 -; GFX6-NEXT: v_mov_b32_e32 v6, s3 +; GFX6-NEXT: s_or_b32 s7, s7, 1 +; GFX6-NEXT: v_mov_b32_e32 v6, s7 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2| ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GFX6-NEXT: v_mul_lo_u32 v2, v2, s0 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 -; GFX6-NEXT: s_mov_b32 s0, 0xffff -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 +; GFX6-NEXT: v_mul_lo_u32 v2, v2, s4 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 +; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i16 s0, s6 @@ -4343,8 +4341,8 @@ ; ; GFX90A-LABEL: srem_v4i16: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_sext_i32_i16 s0, s6 @@ -4455,16 +4453,16 @@ ; ; GFX6-LABEL: udiv_i3: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_u32 s1, s0, 0x30008 -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, s1 +; GFX6-NEXT: s_bfe_u32 s2, s4, 0x30008 +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GFX6-NEXT: s_and_b32 s0, s0, 7 -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 +; GFX6-NEXT: s_and_b32 s4, s4, 7 +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s4 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1 @@ -4472,13 +4470,13 @@ ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc ; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 -; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_i3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_u32 s0, s4, 0x30008 @@ -4498,8 +4496,8 @@ ; ; GFX90A-LABEL: udiv_i3: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_bfe_u32 s0, s4, 0x30008 @@ -4547,27 +4545,27 @@ ; ; GFX6-LABEL: urem_i3: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_u32 s1, s0, 0x30008 -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, s1 +; GFX6-NEXT: s_bfe_u32 s2, s4, 0x30008 +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GFX6-NEXT: s_and_b32 s2, s0, 7 -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 -; GFX6-NEXT: s_lshr_b32 s1, s0, 8 +; GFX6-NEXT: s_and_b32 s3, s4, 7 +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s3 +; GFX6-NEXT: s_lshr_b32 s2, s4, 8 ; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1 ; GFX6-NEXT: v_mad_f32 v1, -v1, v0, v2 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s1 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 -; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_i3: @@ -4597,8 +4595,8 @@ ; ; GFX90A-LABEL: urem_i3: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_bfe_u32 s0, s4, 0x30008 @@ -4651,20 +4649,20 @@ ; ; GFX6-LABEL: sdiv_i3: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_i32 s1, s0, 0x30008 -; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s1 -; GFX6-NEXT: s_bfe_i32 s0, s0, 0x30000 -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s0 -; GFX6-NEXT: s_xor_b32 s0, s0, s1 +; GFX6-NEXT: s_bfe_i32 s5, s4, 0x30008 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 +; GFX6-NEXT: s_bfe_i32 s4, s4, 0x30000 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GFX6-NEXT: s_xor_b32 s4, s4, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX6-NEXT: s_ashr_i32 s0, s0, 30 -; GFX6-NEXT: s_or_b32 s0, s0, 1 -; GFX6-NEXT: v_mov_b32_e32 v3, s0 +; GFX6-NEXT: s_ashr_i32 s4, s4, 30 +; GFX6-NEXT: s_or_b32 s4, s4, 1 +; GFX6-NEXT: v_mov_b32_e32 v3, s4 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 @@ -4673,13 +4671,13 @@ ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 -; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_i3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_i32 s0, s4, 0x30008 @@ -4704,8 +4702,8 @@ ; ; GFX90A-LABEL: sdiv_i3: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_bfe_i32 s0, s4, 0x30008 @@ -4762,32 +4760,32 @@ ; ; GFX6-LABEL: srem_i3: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_i32 s1, s0, 0x30008 -; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s1 -; GFX6-NEXT: s_bfe_i32 s3, s0, 0x30000 -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s3 -; GFX6-NEXT: s_xor_b32 s1, s3, s1 +; GFX6-NEXT: s_bfe_i32 s2, s4, 0x30008 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX6-NEXT: s_bfe_i32 s5, s4, 0x30000 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 +; GFX6-NEXT: s_xor_b32 s2, s5, s2 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX6-NEXT: s_ashr_i32 s1, s1, 30 -; GFX6-NEXT: s_or_b32 s1, s1, 1 -; GFX6-NEXT: v_mov_b32_e32 v3, s1 +; GFX6-NEXT: s_ashr_i32 s2, s2, 30 +; GFX6-NEXT: s_or_b32 s2, s2, 1 +; GFX6-NEXT: v_mov_b32_e32 v3, s2 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GFX6-NEXT: s_lshr_b32 s2, s0, 8 +; GFX6-NEXT: s_lshr_b32 s3, s4, 8 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 -; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_i3: @@ -4822,8 +4820,8 @@ ; ; GFX90A-LABEL: srem_i3: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_bfe_i32 s0, s4, 0x30008 @@ -4920,20 +4918,20 @@ ; ; GFX6-LABEL: udiv_v3i16: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; GFX6-NEXT: s_mov_b32 s8, 0xffff ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s6, s0, s8 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX6-NEXT: s_and_b32 s6, s2, s8 -; GFX6-NEXT: s_lshr_b32 s0, s0, 16 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX6-NEXT: s_and_b32 s6, s0, s8 +; GFX6-NEXT: s_lshr_b32 s2, s2, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s0 -; GFX6-NEXT: s_lshr_b32 s0, s2, 16 +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s2 +; GFX6-NEXT: s_lshr_b32 s0, s0, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s0 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 @@ -4943,11 +4941,11 @@ ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: s_and_b32 s0, s1, s8 +; GFX6-NEXT: s_and_b32 s0, s3, s8 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc ; GFX6-NEXT: v_mad_f32 v2, -v1, v3, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s0 -; GFX6-NEXT: s_and_b32 s0, s3, s8 +; GFX6-NEXT: s_and_b32 s0, s1, s8 ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 @@ -4969,21 +4967,21 @@ ; ; GFX9-LABEL: udiv_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s8, 0xffff +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c +; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s6, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX9-NEXT: s_and_b32 s0, s4, s8 -; GFX9-NEXT: s_lshr_b32 s1, s6, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s0 +; GFX9-NEXT: s_and_b32 s1, s2, s0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 +; GFX9-NEXT: s_and_b32 s1, s6, s0 +; GFX9-NEXT: s_lshr_b32 s2, s2, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s1 -; GFX9-NEXT: s_lshr_b32 s0, s4, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2 +; GFX9-NEXT: s_lshr_b32 s1, s6, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s1 ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 @@ -4992,11 +4990,11 @@ ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 ; GFX9-NEXT: v_mul_f32_e32 v2, v5, v6 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 -; GFX9-NEXT: s_and_b32 s0, s7, s8 +; GFX9-NEXT: s_and_b32 s1, s3, s0 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc ; GFX9-NEXT: v_mad_f32 v3, -v2, v4, v5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 -; GFX9-NEXT: s_and_b32 s0, s5, s8 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s1 +; GFX9-NEXT: s_and_b32 s0, s7, s0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5 @@ -5010,27 +5008,27 @@ ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 ; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX9-NEXT: global_store_short v1, v3, s[2:3] offset:4 -; GFX9-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-NEXT: global_store_short v1, v3, s[4:5] offset:4 +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX90A-LABEL: udiv_v3i16: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 -; GFX90A-NEXT: s_mov_b32 s8, 0xffff +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c +; GFX90A-NEXT: s_mov_b32 s0, 0xffff ; GFX90A-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_and_b32 s0, s6, s8 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX90A-NEXT: s_and_b32 s0, s4, s8 -; GFX90A-NEXT: s_lshr_b32 s1, s6, 16 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v2, s0 +; GFX90A-NEXT: s_and_b32 s1, s2, s0 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s1 +; GFX90A-NEXT: s_and_b32 s1, s6, s0 +; GFX90A-NEXT: s_lshr_b32 s2, s2, 16 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v2, s1 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s1 -; GFX90A-NEXT: s_lshr_b32 s0, s4, 16 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s0 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s2 +; GFX90A-NEXT: s_lshr_b32 s1, s6, 16 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s1 ; GFX90A-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v6, v4 ; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 @@ -5039,11 +5037,11 @@ ; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 ; GFX90A-NEXT: v_mul_f32_e32 v2, v5, v6 ; GFX90A-NEXT: v_trunc_f32_e32 v2, v2 -; GFX90A-NEXT: s_and_b32 s0, s7, s8 +; GFX90A-NEXT: s_and_b32 s1, s3, s0 ; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc ; GFX90A-NEXT: v_mad_f32 v3, -v2, v4, v5 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s0 -; GFX90A-NEXT: s_and_b32 s0, s5, s8 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s1 +; GFX90A-NEXT: s_and_b32 s0, s7, s0 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v6, s0 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v7, v5 @@ -5057,8 +5055,8 @@ ; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 ; GFX90A-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX90A-NEXT: global_store_short v1, v3, s[2:3] offset:4 -; GFX90A-NEXT: global_store_dword v1, v0, s[2:3] +; GFX90A-NEXT: global_store_short v1, v3, s[4:5] offset:4 +; GFX90A-NEXT: global_store_dword v1, v0, s[4:5] ; GFX90A-NEXT: s_endpgm %r = udiv <3 x i16> %x, %y store <3 x i16> %r, <3 x i16> addrspace(1)* %out @@ -5138,22 +5136,21 @@ ; ; GFX6-LABEL: urem_v3i16: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; GFX6-NEXT: s_mov_b32 s8, 0xffff ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_and_b32 s6, s0, s8 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX6-NEXT: s_and_b32 s6, s2, s8 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX6-NEXT: s_and_b32 s6, s0, s8 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 -; GFX6-NEXT: v_mov_b32_e32 v4, s0 +; GFX6-NEXT: v_mov_b32_e32 v4, s2 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX6-NEXT: v_alignbit_b32 v4, s1, v4, 16 +; GFX6-NEXT: v_alignbit_b32 v4, s3, v4, 16 ; GFX6-NEXT: v_and_b32_e32 v5, s8, v4 -; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 16 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3 @@ -5161,34 +5158,35 @@ ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, v5 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc +; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 +; GFX6-NEXT: v_alignbit_b32 v1, s1, v1, 16 ; GFX6-NEXT: v_and_b32_e32 v3, s8, v1 -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s0 ; GFX6-NEXT: v_cvt_f32_u32_e32 v3, v3 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 -; GFX6-NEXT: s_and_b32 s0, s1, s8 -; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s0 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 ; GFX6-NEXT: s_and_b32 s0, s3, s8 +; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s0 ; GFX6-NEXT: v_mul_f32_e32 v5, v3, v5 +; GFX6-NEXT: s_and_b32 s0, s1, s8 ; GFX6-NEXT: v_trunc_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v6 ; GFX6-NEXT: v_mad_f32 v3, -v5, v2, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, v7, v8 -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GFX6-NEXT: v_trunc_f32_e32 v3, v3 +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, v2, v4 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v3 ; GFX6-NEXT: v_mad_f32 v3, -v3, v6, v7 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v6 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v3, v3, s1 +; GFX6-NEXT: v_mul_lo_u32 v3, v3, s3 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s3, v3 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s1, v3 ; GFX6-NEXT: v_and_b32_e32 v0, s8, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 @@ -5197,108 +5195,108 @@ ; ; GFX9-LABEL: urem_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s8, 0xffff +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s4, s8 -; GFX9-NEXT: s_and_b32 s1, s6, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s6, s6, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX9-NEXT: s_and_b32 s1, s2, s0 +; GFX9-NEXT: s_and_b32 s8, s4, s0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s1 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX9-NEXT: s_lshr_b32 s2, s2, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v2 ; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 -; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v3 ; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, v4, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s1 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 -; GFX9-NEXT: s_and_b32 s1, s7, s8 +; GFX9-NEXT: s_and_b32 s5, s5, s0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v3 ; GFX9-NEXT: v_mad_f32 v3, -v1, v2, v4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s1 -; GFX9-NEXT: s_and_b32 s5, s5, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s5 -; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s5 +; GFX9-NEXT: s_and_b32 s0, s3, s0 +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 -; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s8 ; GFX9-NEXT: v_mul_f32_e32 v2, v5, v6 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_mad_f32 v2, -v2, v4, v5 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 -; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, v2, s1 +; GFX9-NEXT: v_mul_lo_u32 v1, v1, s4 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, s5 +; GFX9-NEXT: v_sub_u32_e32 v0, s1, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_sub_u32_e32 v1, s4, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, s2, v1 +; GFX9-NEXT: v_sub_u32_e32 v2, s0, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-NEXT: global_store_short v3, v2, s[2:3] offset:4 -; GFX9-NEXT: global_store_dword v3, v0, s[2:3] +; GFX9-NEXT: global_store_short v3, v2, s[6:7] offset:4 +; GFX9-NEXT: global_store_dword v3, v0, s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX90A-LABEL: urem_v3i16: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 -; GFX90A-NEXT: s_mov_b32 s8, 0xffff +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c +; GFX90A-NEXT: s_mov_b32 s0, 0xffff ; GFX90A-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_and_b32 s1, s4, s8 -; GFX90A-NEXT: s_and_b32 s0, s6, s8 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v2, s1 -; GFX90A-NEXT: s_lshr_b32 s6, s6, 16 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s6 +; GFX90A-NEXT: s_and_b32 s1, s2, s0 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s1 +; GFX90A-NEXT: s_and_b32 s8, s6, s0 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v2, s8 +; GFX90A-NEXT: s_lshr_b32 s2, s2, 16 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s4 -; GFX90A-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s2 +; GFX90A-NEXT: s_lshr_b32 s6, s6, 16 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s6 ; GFX90A-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 ; GFX90A-NEXT: v_mad_f32 v2, -v3, v0, v2 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v6, v4 ; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc ; GFX90A-NEXT: v_mul_f32_e32 v2, v5, v6 +; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s1 ; GFX90A-NEXT: v_trunc_f32_e32 v2, v2 -; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc -; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s0 -; GFX90A-NEXT: s_and_b32 s0, s7, s8 +; GFX90A-NEXT: s_and_b32 s1, s3, s0 ; GFX90A-NEXT: v_mad_f32 v3, -v2, v4, v5 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s0 -; GFX90A-NEXT: v_sub_u32_e32 v0, s1, v0 -; GFX90A-NEXT: s_and_b32 s1, s5, s8 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v6, s1 -; GFX90A-NEXT: v_rcp_iflag_f32_e32 v7, v5 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s1 +; GFX90A-NEXT: s_and_b32 s0, s7, s0 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v6, s0 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v7, v5 ; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 -; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX90A-NEXT: v_sub_u32_e32 v0, s8, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc ; GFX90A-NEXT: v_mul_f32_e32 v3, v6, v7 ; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v4, v3 -; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc ; GFX90A-NEXT: v_mad_f32 v3, -v3, v5, v6 ; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 +; GFX90A-NEXT: v_mul_lo_u32 v2, v2, s2 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX90A-NEXT: v_mul_lo_u32 v2, v2, s6 -; GFX90A-NEXT: v_mul_lo_u32 v3, v3, s0 -; GFX90A-NEXT: v_sub_u32_e32 v2, s4, v2 -; GFX90A-NEXT: v_sub_u32_e32 v3, s1, v3 +; GFX90A-NEXT: v_mul_lo_u32 v3, v3, s1 +; GFX90A-NEXT: v_sub_u32_e32 v2, s6, v2 +; GFX90A-NEXT: v_sub_u32_e32 v3, s0, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX90A-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX90A-NEXT: global_store_short v1, v3, s[2:3] offset:4 -; GFX90A-NEXT: global_store_dword v1, v0, s[2:3] +; GFX90A-NEXT: global_store_short v1, v3, s[4:5] offset:4 +; GFX90A-NEXT: global_store_dword v1, v0, s[4:5] ; GFX90A-NEXT: s_endpgm %r = urem <3 x i16> %x, %y store <3 x i16> %r, <3 x i16> addrspace(1)* %out @@ -5384,46 +5382,46 @@ ; ; GFX6-LABEL: sdiv_v3i16: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_sext_i32_i16 s9, s2 -; GFX6-NEXT: s_sext_i32_i16 s8, s0 +; GFX6-NEXT: s_sext_i32_i16 s8, s2 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s8 +; GFX6-NEXT: s_sext_i32_i16 s9, s0 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s9 ; GFX6-NEXT: s_xor_b32 s8, s9, s8 -; GFX6-NEXT: s_ashr_i32 s0, s0, 16 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: s_ashr_i32 s2, s2, 16 ; GFX6-NEXT: s_ashr_i32 s8, s8, 30 ; GFX6-NEXT: s_or_b32 s8, s8, 1 -; GFX6-NEXT: v_mov_b32_e32 v3, s8 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s0 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v3, s8 ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GFX6-NEXT: s_ashr_i32 s2, s2, 16 +; GFX6-NEXT: s_ashr_i32 s0, s0, 16 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s2 +; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 -; GFX6-NEXT: s_xor_b32 s0, s2, s0 +; GFX6-NEXT: s_xor_b32 s0, s0, s2 ; GFX6-NEXT: s_ashr_i32 s0, s0, 30 ; GFX6-NEXT: s_or_b32 s0, s0, 1 ; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3 ; GFX6-NEXT: v_mad_f32 v2, -v3, v1, v2 ; GFX6-NEXT: v_mov_b32_e32 v4, s0 -; GFX6-NEXT: s_sext_i32_i16 s0, s1 +; GFX6-NEXT: s_sext_i32_i16 s0, s3 ; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s0 ; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc -; GFX6-NEXT: s_sext_i32_i16 s1, s3 +; GFX6-NEXT: s_sext_i32_i16 s1, s1 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 @@ -5447,18 +5445,18 @@ ; ; GFX9-LABEL: sdiv_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s1, s4 -; GFX9-NEXT: s_sext_i32_i16 s0, s6 +; GFX9-NEXT: s_sext_i32_i16 s0, s2 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9-NEXT: s_sext_i32_i16 s1, s4 ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 ; GFX9-NEXT: s_xor_b32 s0, s1, s0 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: s_or_b32 s8, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 @@ -5466,60 +5464,60 @@ ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cselect_b32 s0, s8, 0 -; GFX9-NEXT: s_ashr_i32 s1, s6, 16 +; GFX9-NEXT: s_ashr_i32 s1, s2, 16 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 -; GFX9-NEXT: s_ashr_i32 s4, s4, 16 +; GFX9-NEXT: s_ashr_i32 s2, s4, 16 ; GFX9-NEXT: v_add_u32_e32 v2, s0, v3 -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s4 +; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX9-NEXT: s_xor_b32 s0, s4, s1 +; GFX9-NEXT: s_xor_b32 s0, s2, s1 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: s_or_b32 s4, s0, 1 +; GFX9-NEXT: s_or_b32 s2, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v0| ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX9-NEXT: s_sext_i32_i16 s1, s7 +; GFX9-NEXT: s_sext_i32_i16 s1, s3 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 -; GFX9-NEXT: s_cselect_b32 s0, s4, 0 +; GFX9-NEXT: s_cselect_b32 s0, s2, 0 ; GFX9-NEXT: v_add_u32_e32 v3, s0, v4 ; GFX9-NEXT: s_sext_i32_i16 s0, s5 ; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v0 ; GFX9-NEXT: s_xor_b32 s0, s0, s1 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: s_or_b32 s4, s0, 1 +; GFX9-NEXT: s_or_b32 s2, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v4, -v5, v0, v4 ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v0| ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s4, 0 +; GFX9-NEXT: s_cselect_b32 s0, s2, 0 ; GFX9-NEXT: v_add_u32_e32 v0, s0, v5 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX9-NEXT: global_store_short v1, v0, s[2:3] offset:4 -; GFX9-NEXT: global_store_dword v1, v2, s[2:3] +; GFX9-NEXT: global_store_short v1, v0, s[6:7] offset:4 +; GFX9-NEXT: global_store_dword v1, v2, s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX90A-LABEL: sdiv_v3i16: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_sext_i32_i16 s1, s4 -; GFX90A-NEXT: s_sext_i32_i16 s0, s6 +; GFX90A-NEXT: s_sext_i32_i16 s0, s2 ; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX90A-NEXT: s_sext_i32_i16 s1, s4 ; GFX90A-NEXT: v_cvt_f32_i32_e32 v2, s1 ; GFX90A-NEXT: s_xor_b32 s0, s1, s0 -; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 ; GFX90A-NEXT: s_or_b32 s8, s0, 1 ; GFX90A-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 @@ -5527,44 +5525,44 @@ ; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| ; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX90A-NEXT: s_cselect_b32 s0, s8, 0 -; GFX90A-NEXT: s_ashr_i32 s1, s6, 16 +; GFX90A-NEXT: s_ashr_i32 s1, s2, 16 ; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s1 -; GFX90A-NEXT: s_ashr_i32 s4, s4, 16 +; GFX90A-NEXT: s_ashr_i32 s2, s4, 16 ; GFX90A-NEXT: v_add_u32_e32 v2, s0, v3 -; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, s4 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, s2 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX90A-NEXT: s_xor_b32 s0, s4, s1 +; GFX90A-NEXT: s_xor_b32 s0, s2, s1 ; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 -; GFX90A-NEXT: s_or_b32 s4, s0, 1 +; GFX90A-NEXT: s_or_b32 s2, s0, 1 ; GFX90A-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX90A-NEXT: v_trunc_f32_e32 v4, v4 ; GFX90A-NEXT: v_mad_f32 v3, -v4, v0, v3 ; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v0| ; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX90A-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX90A-NEXT: s_sext_i32_i16 s1, s7 +; GFX90A-NEXT: s_sext_i32_i16 s1, s3 ; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s1 -; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 +; GFX90A-NEXT: s_cselect_b32 s0, s2, 0 ; GFX90A-NEXT: v_add_u32_e32 v3, s0, v4 ; GFX90A-NEXT: s_sext_i32_i16 s0, s5 ; GFX90A-NEXT: v_cvt_f32_i32_e32 v4, s0 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v5, v0 ; GFX90A-NEXT: s_xor_b32 s0, s0, s1 ; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 -; GFX90A-NEXT: s_or_b32 s4, s0, 1 +; GFX90A-NEXT: s_or_b32 s2, s0, 1 ; GFX90A-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 ; GFX90A-NEXT: v_mad_f32 v4, -v5, v0, v4 ; GFX90A-NEXT: v_cvt_i32_f32_e32 v5, v5 ; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v0| ; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 +; GFX90A-NEXT: s_cselect_b32 s0, s2, 0 ; GFX90A-NEXT: v_add_u32_e32 v0, s0, v5 ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX90A-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX90A-NEXT: global_store_short v1, v0, s[2:3] offset:4 -; GFX90A-NEXT: global_store_dword v1, v2, s[2:3] +; GFX90A-NEXT: global_store_short v1, v0, s[6:7] offset:4 +; GFX90A-NEXT: global_store_dword v1, v2, s[6:7] ; GFX90A-NEXT: s_endpgm %r = sdiv <3 x i16> %x, %y store <3 x i16> %r, <3 x i16> addrspace(1)* %out @@ -5656,157 +5654,158 @@ ; ; GFX6-LABEL: srem_v3i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_sext_i32_i16 s8, s2 -; GFX6-NEXT: s_sext_i32_i16 s6, s0 -; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX6-NEXT: s_sext_i32_i16 s2, s4 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX6-NEXT: s_sext_i32_i16 s8, s6 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s8 -; GFX6-NEXT: s_xor_b32 s6, s8, s6 -; GFX6-NEXT: s_ashr_i32 s6, s6, 30 +; GFX6-NEXT: s_xor_b32 s2, s8, s2 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX6-NEXT: s_or_b32 s6, s6, 1 -; GFX6-NEXT: v_mov_b32_e32 v3, s6 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_ashr_i32 s2, s2, 30 +; GFX6-NEXT: s_or_b32 s2, s2, 1 +; GFX6-NEXT: v_mov_b32_e32 v3, s2 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 16 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: v_alignbit_b32 v2, s5, v2, 16 ; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v4, v3 -; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 16 +; GFX6-NEXT: v_alignbit_b32 v1, s7, v1, 16 ; GFX6-NEXT: v_bfe_i32 v5, v1, 0, 16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v6, v5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v4 -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s0 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, s4 ; GFX6-NEXT: v_xor_b32_e32 v3, v5, v3 -; GFX6-NEXT: s_sext_i32_i16 s0, s1 +; GFX6-NEXT: s_sext_i32_i16 s4, s5 ; GFX6-NEXT: v_mul_f32_e32 v5, v6, v7 ; GFX6-NEXT: v_trunc_f32_e32 v5, v5 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 ; GFX6-NEXT: v_mad_f32 v6, -v5, v4, v6 ; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 30, v3 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, |v4| -; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s0 +; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s4 ; GFX6-NEXT: v_or_b32_e32 v3, 1, v3 ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GFX6-NEXT: s_sext_i32_i16 s2, s3 +; GFX6-NEXT: s_sext_i32_i16 s6, s7 ; GFX6-NEXT: v_mul_lo_u32 v2, v3, v2 -; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s2 +; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v4 -; GFX6-NEXT: s_xor_b32 s0, s2, s0 -; GFX6-NEXT: s_ashr_i32 s0, s0, 30 -; GFX6-NEXT: s_or_b32 s0, s0, 1 +; GFX6-NEXT: s_xor_b32 s4, s6, s4 +; GFX6-NEXT: s_ashr_i32 s4, s4, 30 +; GFX6-NEXT: s_or_b32 s4, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v5, v3, v5 ; GFX6-NEXT: v_trunc_f32_e32 v5, v5 ; GFX6-NEXT: v_mad_f32 v3, -v5, v4, v3 ; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, s0 +; GFX6-NEXT: v_mov_b32_e32 v6, s4 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v4| ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GFX6-NEXT: v_mul_lo_u32 v3, v3, s1 +; GFX6-NEXT: v_mul_lo_u32 v3, v3, s5 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s7, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s3, v3 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_v3i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i16 s8, s2 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s8 -; GFX9-NEXT: s_sext_i32_i16 s9, s6 +; GFX9-NEXT: s_sext_i32_i16 s9, s4 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s9 -; GFX9-NEXT: s_xor_b32 s0, s9, s8 +; GFX9-NEXT: s_xor_b32 s6, s9, s8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: s_or_b32 s10, s0, 1 -; GFX9-NEXT: s_sext_i32_i16 s3, s3 +; GFX9-NEXT: s_ashr_i32 s6, s6, 30 +; GFX9-NEXT: s_or_b32 s10, s6, 1 +; GFX9-NEXT: s_sext_i32_i16 s5, s5 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 ; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s10, 0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, |v0| +; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GFX9-NEXT: s_cselect_b32 s6, s10, 0 ; GFX9-NEXT: s_ashr_i32 s2, s2, 16 ; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX9-NEXT: s_ashr_i32 s6, s6, 16 -; GFX9-NEXT: v_add_u32_e32 v1, s0, v2 -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s6 +; GFX9-NEXT: s_ashr_i32 s4, s4, 16 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_add_u32_e32 v1, s6, v2 +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX9-NEXT: s_xor_b32 s0, s6, s2 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_xor_b32 s6, s4, s2 +; GFX9-NEXT: s_ashr_i32 s6, s6, 30 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, s8 ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX9-NEXT: s_or_b32 s8, s0, 1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s3 -; GFX9-NEXT: s_cselect_b32 s0, s8, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s0, v3 +; GFX9-NEXT: s_or_b32 s8, s6, 1 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v2|, |v0| +; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GFX9-NEXT: s_cselect_b32 s6, s8, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s6, v3 +; GFX9-NEXT: s_sext_i32_i16 s6, s3 +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s6 +; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s5 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2 -; GFX9-NEXT: s_sext_i32_i16 s2, s7 -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s2 +; GFX9-NEXT: s_xor_b32 s2, s5, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GFX9-NEXT: s_xor_b32 s0, s2, s3 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: s_or_b32 s7, s0, 1 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_or_b32 s7, s2, 1 +; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 ; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: v_mad_f32 v3, -v4, v2, v3 ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v2| -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s7, 0 -; GFX9-NEXT: v_add_u32_e32 v2, s0, v4 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, s3 -; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v3|, |v2| +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s7, 0 +; GFX9-NEXT: v_add_u32_e32 v2, s2, v4 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_sub_u32_e32 v0, s6, v0 -; GFX9-NEXT: v_sub_u32_e32 v2, s2, v2 +; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 -; GFX9-NEXT: global_store_short v3, v2, s[4:5] offset:4 -; GFX9-NEXT: global_store_dword v3, v0, s[4:5] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_short v3, v2, s[0:1] offset:4 +; GFX9-NEXT: global_store_dword v3, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX90A-LABEL: srem_v3i16: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_sext_i32_i16 s9, s4 -; GFX90A-NEXT: s_sext_i32_i16 s8, s6 +; GFX90A-NEXT: s_sext_i32_i16 s8, s2 ; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s8 +; GFX90A-NEXT: s_sext_i32_i16 s9, s4 ; GFX90A-NEXT: v_cvt_f32_i32_e32 v2, s9 ; GFX90A-NEXT: s_xor_b32 s0, s9, s8 -; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 ; GFX90A-NEXT: s_or_b32 s10, s0, 1 ; GFX90A-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 @@ -5814,14 +5813,14 @@ ; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| ; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX90A-NEXT: s_cselect_b32 s0, s10, 0 -; GFX90A-NEXT: s_ashr_i32 s6, s6, 16 +; GFX90A-NEXT: s_ashr_i32 s2, s2, 16 ; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX90A-NEXT: v_cvt_f32_i32_e32 v2, s6 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v2, s2 ; GFX90A-NEXT: s_ashr_i32 s4, s4, 16 ; GFX90A-NEXT: v_add_u32_e32 v0, s0, v3 ; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, s4 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GFX90A-NEXT: s_xor_b32 s0, s4, s6 +; GFX90A-NEXT: s_xor_b32 s0, s4, s2 ; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 ; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s8 ; GFX90A-NEXT: v_mul_f32_e32 v4, v3, v4 @@ -5833,31 +5832,31 @@ ; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX90A-NEXT: s_cselect_b32 s0, s8, 0 ; GFX90A-NEXT: v_add_u32_e32 v2, s0, v4 -; GFX90A-NEXT: v_mul_lo_u32 v2, v2, s6 -; GFX90A-NEXT: s_sext_i32_i16 s6, s7 -; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, s6 -; GFX90A-NEXT: v_sub_u32_e32 v2, s4, v2 -; GFX90A-NEXT: s_sext_i32_i16 s4, s5 -; GFX90A-NEXT: v_cvt_f32_i32_e32 v4, s4 +; GFX90A-NEXT: v_mul_lo_u32 v2, v2, s2 +; GFX90A-NEXT: s_sext_i32_i16 s2, s3 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, s2 +; GFX90A-NEXT: s_sext_i32_i16 s3, s5 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v4, s3 +; GFX90A-NEXT: s_xor_b32 s0, s3, s2 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v5, v3 -; GFX90A-NEXT: s_xor_b32 s0, s4, s6 ; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 -; GFX90A-NEXT: s_or_b32 s5, s0, 1 +; GFX90A-NEXT: v_sub_u32_e32 v2, s4, v2 +; GFX90A-NEXT: s_or_b32 s4, s0, 1 ; GFX90A-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 ; GFX90A-NEXT: v_mad_f32 v4, -v5, v3, v4 ; GFX90A-NEXT: v_cvt_i32_f32_e32 v5, v5 ; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| ; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX90A-NEXT: s_cselect_b32 s0, s5, 0 +; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 ; GFX90A-NEXT: v_add_u32_e32 v3, s0, v5 ; GFX90A-NEXT: v_sub_u32_e32 v0, s9, v0 -; GFX90A-NEXT: v_mul_lo_u32 v3, v3, s6 -; GFX90A-NEXT: v_sub_u32_e32 v3, s4, v3 +; GFX90A-NEXT: v_mul_lo_u32 v3, v3, s2 +; GFX90A-NEXT: v_sub_u32_e32 v3, s3, v3 ; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX90A-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX90A-NEXT: global_store_short v1, v3, s[2:3] offset:4 -; GFX90A-NEXT: global_store_dword v1, v0, s[2:3] +; GFX90A-NEXT: global_store_short v1, v3, s[6:7] offset:4 +; GFX90A-NEXT: global_store_dword v1, v0, s[6:7] ; GFX90A-NEXT: s_endpgm %r = srem <3 x i16> %x, %y store <3 x i16> %r, <3 x i16> addrspace(1)* %out @@ -5931,8 +5930,8 @@ ; ; GFX6-LABEL: udiv_v3i15: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 @@ -5989,35 +5988,35 @@ ; ; GFX9-LABEL: udiv_v3i15: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 -; GFX9-NEXT: s_movk_i32 s8, 0x7fff +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s4, s8 -; GFX9-NEXT: s_and_b32 s1, s6, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0 -; GFX9-NEXT: s_bfe_u32 s0, s6, 0xf000f -; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_alignbit_b32 v0, s3, v0, 30 +; GFX9-NEXT: s_and_b32 s3, s2, s6 +; GFX9-NEXT: s_and_b32 s7, s0, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0xf000f +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 -; GFX9-NEXT: s_bfe_u32 s1, s4, 0xf000f -; GFX9-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NEXT: v_alignbit_b32 v3, s7, v3, 30 +; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0 +; GFX9-NEXT: s_bfe_u32 s2, s2, 0xf000f +; GFX9-NEXT: v_alignbit_b32 v3, s1, v3, 30 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1 +; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6 -; GFX9-NEXT: v_and_b32_e32 v3, s8, v3 +; GFX9-NEXT: v_and_b32_e32 v3, s6, v3 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v4, -v5, v1, v4 ; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v3 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 30 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, v7, v8 -; GFX9-NEXT: v_and_b32_e32 v0, s8, v0 +; GFX9-NEXT: v_and_b32_e32 v0, s6, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc ; GFX9-NEXT: v_mad_f32 v5, -v1, v6, v7 @@ -6031,49 +6030,49 @@ ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v1 ; GFX9-NEXT: v_mad_f32 v0, -v1, v3, v0 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v3 -; GFX9-NEXT: v_and_b32_e32 v3, s8, v4 +; GFX9-NEXT: v_and_b32_e32 v3, s6, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc -; GFX9-NEXT: v_and_b32_e32 v4, s8, v5 +; GFX9-NEXT: v_and_b32_e32 v4, s6, v5 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX9-NEXT: global_store_dword v2, v0, s[2:3] +; GFX9-NEXT: global_store_dword v2, v0, s[4:5] ; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX9-NEXT: global_store_short v2, v0, s[2:3] offset:4 +; GFX9-NEXT: global_store_short v2, v0, s[4:5] offset:4 ; GFX9-NEXT: s_endpgm ; ; GFX90A-LABEL: udiv_v3i15: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 -; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX90A-NEXT: s_movk_i32 s6, 0x7fff +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_and_b32 s0, s4, s8 -; GFX90A-NEXT: s_and_b32 s1, s6, s8 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s1 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s0 -; GFX90A-NEXT: s_bfe_u32 s0, s6, 0xf000f -; GFX90A-NEXT: v_cvt_f32_u32_e32 v6, s0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NEXT: v_alignbit_b32 v0, s3, v0, 30 +; GFX90A-NEXT: s_and_b32 s3, s2, s6 +; GFX90A-NEXT: s_and_b32 s7, s0, s6 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GFX90A-NEXT: v_mov_b32_e32 v3, s0 +; GFX90A-NEXT: s_bfe_u32 s0, s0, 0xf000f +; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s3 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v5, v1 -; GFX90A-NEXT: s_bfe_u32 s1, s4, 0xf000f -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 -; GFX90A-NEXT: v_alignbit_b32 v3, s7, v3, 30 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v6, s0 +; GFX90A-NEXT: s_bfe_u32 s2, s2, 0xf000f +; GFX90A-NEXT: v_alignbit_b32 v3, s1, v3, 30 ; GFX90A-NEXT: v_mul_f32_e32 v5, v4, v5 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v7, s1 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v7, s2 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v8, v6 -; GFX90A-NEXT: v_and_b32_e32 v3, s8, v3 +; GFX90A-NEXT: v_and_b32_e32 v3, s6, v3 ; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 ; GFX90A-NEXT: v_mad_f32 v4, -v5, v1, v4 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v3, v3 -; GFX90A-NEXT: v_mov_b32_e32 v0, s4 -; GFX90A-NEXT: v_alignbit_b32 v0, s5, v0, 30 ; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 ; GFX90A-NEXT: v_mul_f32_e32 v1, v7, v8 -; GFX90A-NEXT: v_and_b32_e32 v0, s8, v0 +; GFX90A-NEXT: v_and_b32_e32 v0, s6, v0 ; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc ; GFX90A-NEXT: v_mad_f32 v5, -v1, v6, v7 @@ -6087,16 +6086,16 @@ ; GFX90A-NEXT: v_cvt_u32_f32_e32 v6, v1 ; GFX90A-NEXT: v_mad_f32 v0, -v1, v3, v0 ; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v3 -; GFX90A-NEXT: v_and_b32_e32 v3, s8, v4 -; GFX90A-NEXT: v_and_b32_e32 v4, s8, v5 +; GFX90A-NEXT: v_and_b32_e32 v3, s6, v4 +; GFX90A-NEXT: v_and_b32_e32 v4, s6, v5 ; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 15, v4 ; GFX90A-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] ; GFX90A-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX90A-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX90A-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NEXT: global_store_dword v2, v0, s[4:5] ; GFX90A-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX90A-NEXT: global_store_short v2, v0, s[2:3] offset:4 +; GFX90A-NEXT: global_store_short v2, v0, s[4:5] offset:4 ; GFX90A-NEXT: s_endpgm %r = udiv <3 x i15> %x, %y store <3 x i15> %r, <3 x i15> addrspace(1)* %out @@ -6176,8 +6175,8 @@ ; ; GFX6-LABEL: urem_v3i15: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 @@ -6242,35 +6241,35 @@ ; ; GFX9-LABEL: urem_v3i15: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 -; GFX9-NEXT: s_movk_i32 s8, 0x7fff +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 30 -; GFX9-NEXT: s_and_b32 s5, s6, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s5 -; GFX9-NEXT: s_and_b32 s0, s4, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0 -; GFX9-NEXT: s_bfe_u32 s5, s6, 0xf000f +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_alignbit_b32 v0, s3, v0, 30 +; GFX9-NEXT: s_and_b32 s3, s2, s6 +; GFX9-NEXT: s_and_b32 s8, s0, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s8 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s3 +; GFX9-NEXT: s_bfe_u32 s3, s0, 0xf000f +; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s5 -; GFX9-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NEXT: v_alignbit_b32 v3, s7, v3, 30 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_alignbit_b32 v3, s1, v3, 30 +; GFX9-NEXT: s_bfe_u32 s7, s2, 0xf000f ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v4, -v5, v1, v4 ; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GFX9-NEXT: s_bfe_u32 s1, s4, 0xf000f -; GFX9-NEXT: v_and_b32_e32 v3, s8, v3 +; GFX9-NEXT: v_and_b32_e32 v3, s6, v3 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1 +; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v3 -; GFX9-NEXT: v_and_b32_e32 v0, s8, v0 +; GFX9-NEXT: v_and_b32_e32 v0, s6, v0 ; GFX9-NEXT: v_mul_f32_e32 v4, v7, v8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v9, v5 @@ -6283,72 +6282,72 @@ ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: v_mad_f32 v6, -v6, v5, v8 -; GFX9-NEXT: s_lshr_b32 s0, s6, 15 +; GFX9-NEXT: s_lshr_b32 s1, s0, 15 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, v5 -; GFX9-NEXT: v_mul_lo_u32 v4, v4, s0 +; GFX9-NEXT: v_mul_lo_u32 v4, v4, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6 +; GFX9-NEXT: v_mul_lo_u32 v1, v1, s0 ; GFX9-NEXT: v_mul_lo_u32 v3, v5, v3 -; GFX9-NEXT: s_lshr_b32 s0, s4, 15 +; GFX9-NEXT: s_lshr_b32 s0, s2, 15 ; GFX9-NEXT: v_sub_u32_e32 v4, s0, v4 -; GFX9-NEXT: v_sub_u32_e32 v5, s4, v1 +; GFX9-NEXT: v_sub_u32_e32 v5, s2, v1 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 -; GFX9-NEXT: v_and_b32_e32 v4, s8, v4 +; GFX9-NEXT: v_and_b32_e32 v4, s6, v4 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] -; GFX9-NEXT: v_and_b32_e32 v3, s8, v5 +; GFX9-NEXT: v_and_b32_e32 v3, s6, v5 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX9-NEXT: global_store_dword v2, v0, s[2:3] +; GFX9-NEXT: global_store_dword v2, v0, s[4:5] ; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX9-NEXT: global_store_short v2, v0, s[2:3] offset:4 +; GFX9-NEXT: global_store_short v2, v0, s[4:5] offset:4 ; GFX9-NEXT: s_endpgm ; ; GFX90A-LABEL: urem_v3i15: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 -; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX90A-NEXT: s_movk_i32 s6, 0x7fff +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_and_b32 s1, s4, s8 -; GFX90A-NEXT: s_and_b32 s9, s6, s8 +; GFX90A-NEXT: s_and_b32 s7, s2, s6 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s7 +; GFX90A-NEXT: s_bfe_u32 s8, s2, 0xf000f +; GFX90A-NEXT: s_and_b32 s9, s0, s6 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 -; GFX90A-NEXT: v_alignbit_b32 v3, s7, v3, 30 -; GFX90A-NEXT: v_rcp_iflag_f32_e32 v5, v1 -; GFX90A-NEXT: s_bfe_u32 s7, s6, 0xf000f +; GFX90A-NEXT: s_bfe_u32 s7, s0, 0xf000f ; GFX90A-NEXT: v_cvt_f32_u32_e32 v6, s7 -; GFX90A-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v7, s8 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s0 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v8, v6 +; GFX90A-NEXT: v_alignbit_b32 v3, s1, v3, 30 ; GFX90A-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 ; GFX90A-NEXT: v_mad_f32 v4, -v5, v1, v4 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GFX90A-NEXT: v_alignbit_b32 v0, s5, v0, 30 -; GFX90A-NEXT: s_bfe_u32 s5, s4, 0xf000f -; GFX90A-NEXT: v_cvt_f32_u32_e32 v7, s5 -; GFX90A-NEXT: v_rcp_iflag_f32_e32 v8, v6 ; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 +; GFX90A-NEXT: v_and_b32_e32 v3, s6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, s2 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc -; GFX90A-NEXT: v_and_b32_e32 v3, s8, v3 -; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s6 -; GFX90A-NEXT: v_sub_u32_e32 v4, s4, v1 +; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s0 +; GFX90A-NEXT: v_sub_u32_e32 v4, s2, v1 ; GFX90A-NEXT: v_mul_f32_e32 v1, v7, v8 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, v3 ; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 +; GFX90A-NEXT: v_alignbit_b32 v0, s3, v0, 30 ; GFX90A-NEXT: v_mad_f32 v7, -v1, v6, v7 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX90A-NEXT: v_and_b32_e32 v0, s8, v0 +; GFX90A-NEXT: v_and_b32_e32 v0, s6, v0 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v8, v0 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v9, v5 ; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, v6 -; GFX90A-NEXT: s_lshr_b32 s1, s6, 15 +; GFX90A-NEXT: s_lshr_b32 s1, s0, 15 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: s_lshr_b32 s0, s4, 15 +; GFX90A-NEXT: s_lshr_b32 s3, s2, 15 ; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s1 -; GFX90A-NEXT: v_sub_u32_e32 v6, s0, v1 +; GFX90A-NEXT: v_sub_u32_e32 v6, s3, v1 ; GFX90A-NEXT: v_mul_f32_e32 v1, v8, v9 ; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v7, v1 @@ -6356,16 +6355,16 @@ ; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v5 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc ; GFX90A-NEXT: v_mul_lo_u32 v1, v1, v3 -; GFX90A-NEXT: v_and_b32_e32 v3, s8, v4 -; GFX90A-NEXT: v_and_b32_e32 v4, s8, v6 +; GFX90A-NEXT: v_and_b32_e32 v3, s6, v4 +; GFX90A-NEXT: v_and_b32_e32 v4, s6, v6 ; GFX90A-NEXT: v_sub_u32_e32 v0, v0, v1 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 15, v4 ; GFX90A-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] ; GFX90A-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX90A-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX90A-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NEXT: global_store_dword v2, v0, s[4:5] ; GFX90A-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX90A-NEXT: global_store_short v2, v0, s[2:3] offset:4 +; GFX90A-NEXT: global_store_short v2, v0, s[4:5] offset:4 ; GFX90A-NEXT: s_endpgm %r = urem <3 x i15> %x, %y store <3 x i15> %r, <3 x i15> addrspace(1)* %out @@ -6451,8 +6450,8 @@ ; ; GFX6-LABEL: sdiv_v3i15: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 @@ -6523,48 +6522,48 @@ ; ; GFX9-LABEL: sdiv_v3i15: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s1, s4, 0xf0000 -; GFX9-NEXT: s_bfe_i32 s0, s6, 0xf0000 +; GFX9-NEXT: s_bfe_i32 s1, s2, 0xf0000 +; GFX9-NEXT: s_bfe_i32 s0, s4, 0xf0000 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 ; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s1 ; GFX9-NEXT: s_xor_b32 s0, s1, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 30 -; GFX9-NEXT: s_or_b32 s5, s0, 1 +; GFX9-NEXT: v_alignbit_b32 v0, s3, v0, 30 +; GFX9-NEXT: s_or_b32 s3, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX9-NEXT: s_cselect_b32 s0, s5, 0 -; GFX9-NEXT: s_bfe_i32 s1, s6, 0xf000f +; GFX9-NEXT: s_cselect_b32 s0, s3, 0 +; GFX9-NEXT: s_bfe_i32 s1, s4, 0xf000f ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s1 ; GFX9-NEXT: v_add_u32_e32 v4, s0, v5 -; GFX9-NEXT: s_bfe_i32 s0, s4, 0xf000f +; GFX9-NEXT: s_bfe_i32 s0, s2, 0xf000f ; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_alignbit_b32 v1, s7, v1, 30 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_alignbit_b32 v1, s5, v1, 30 ; GFX9-NEXT: s_xor_b32 s0, s0, s1 ; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: v_mad_f32 v5, -v6, v3, v5 ; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 15 -; GFX9-NEXT: s_or_b32 s4, s0, 1 +; GFX9-NEXT: s_or_b32 s2, s0, 1 ; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v3| ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v1 ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s4, 0 +; GFX9-NEXT: s_cselect_b32 s0, s2, 0 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 15 ; GFX9-NEXT: v_add_u32_e32 v5, s0, v6 ; GFX9-NEXT: v_cvt_f32_i32_e32 v6, v0 @@ -6586,55 +6585,55 @@ ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX9-NEXT: global_store_dword v2, v0, s[2:3] +; GFX9-NEXT: global_store_dword v2, v0, s[6:7] ; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX9-NEXT: global_store_short v2, v0, s[2:3] offset:4 +; GFX9-NEXT: global_store_short v2, v0, s[6:7] offset:4 ; GFX9-NEXT: s_endpgm ; ; GFX90A-LABEL: sdiv_v3i15: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_bfe_i32 s1, s4, 0xf0000 -; GFX90A-NEXT: s_bfe_i32 s0, s6, 0xf0000 +; GFX90A-NEXT: s_bfe_i32 s1, s2, 0xf0000 +; GFX90A-NEXT: s_bfe_i32 s0, s4, 0xf0000 ; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, s0 ; GFX90A-NEXT: v_cvt_f32_i32_e32 v4, s1 ; GFX90A-NEXT: s_xor_b32 s0, s1, s0 -; GFX90A-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v0, s2 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v5, v3 ; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 -; GFX90A-NEXT: v_alignbit_b32 v0, s5, v0, 30 -; GFX90A-NEXT: s_or_b32 s5, s0, 1 +; GFX90A-NEXT: v_alignbit_b32 v0, s3, v0, 30 +; GFX90A-NEXT: s_or_b32 s3, s0, 1 ; GFX90A-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 ; GFX90A-NEXT: v_mad_f32 v4, -v5, v3, v4 ; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| ; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX90A-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX90A-NEXT: s_cselect_b32 s0, s5, 0 -; GFX90A-NEXT: s_bfe_i32 s1, s6, 0xf000f +; GFX90A-NEXT: s_cselect_b32 s0, s3, 0 +; GFX90A-NEXT: s_bfe_i32 s1, s4, 0xf000f ; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, s1 ; GFX90A-NEXT: v_add_u32_e32 v4, s0, v5 -; GFX90A-NEXT: s_bfe_i32 s0, s4, 0xf000f +; GFX90A-NEXT: s_bfe_i32 s0, s2, 0xf000f ; GFX90A-NEXT: v_cvt_f32_i32_e32 v5, s0 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NEXT: v_alignbit_b32 v1, s7, v1, 30 +; GFX90A-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v1, s5, v1, 30 ; GFX90A-NEXT: s_xor_b32 s0, s0, s1 ; GFX90A-NEXT: v_mul_f32_e32 v6, v5, v6 ; GFX90A-NEXT: v_trunc_f32_e32 v6, v6 ; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 ; GFX90A-NEXT: v_mad_f32 v5, -v6, v3, v5 ; GFX90A-NEXT: v_bfe_i32 v1, v1, 0, 15 -; GFX90A-NEXT: s_or_b32 s4, s0, 1 +; GFX90A-NEXT: s_or_b32 s2, s0, 1 ; GFX90A-NEXT: v_cvt_i32_f32_e32 v6, v6 ; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v3| ; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, v1 ; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 +; GFX90A-NEXT: s_cselect_b32 s0, s2, 0 ; GFX90A-NEXT: v_bfe_i32 v0, v0, 0, 15 ; GFX90A-NEXT: v_add_u32_e32 v5, s0, v6 ; GFX90A-NEXT: v_cvt_f32_i32_e32 v6, v0 @@ -6656,9 +6655,9 @@ ; GFX90A-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] ; GFX90A-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX90A-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX90A-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NEXT: global_store_dword v2, v0, s[6:7] ; GFX90A-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX90A-NEXT: global_store_short v2, v0, s[2:3] offset:4 +; GFX90A-NEXT: global_store_short v2, v0, s[6:7] offset:4 ; GFX90A-NEXT: s_endpgm %r = sdiv <3 x i15> %x, %y store <3 x i15> %r, <3 x i15> addrspace(1)* %out @@ -6750,8 +6749,8 @@ ; ; GFX6-LABEL: srem_v3i15: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 @@ -6836,53 +6835,54 @@ ; ; GFX9-LABEL: srem_v3i15: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_movk_i32 s8, 0x7fff +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s4, s8 -; GFX9-NEXT: s_and_b32 s1, s6, s8 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_alignbit_b32 v0, s3, v0, 30 +; GFX9-NEXT: s_and_b32 s3, s2, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_alignbit_b32 v1, s1, v1, 30 +; GFX9-NEXT: s_and_b32 s1, s0, s8 ; GFX9-NEXT: s_bfe_i32 s1, s1, 0xf0000 ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 -; GFX9-NEXT: s_bfe_i32 s0, s0, 0xf0000 -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 -; GFX9-NEXT: s_xor_b32 s0, s0, s1 +; GFX9-NEXT: s_bfe_i32 s3, s3, 0xf0000 +; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s3 +; GFX9-NEXT: s_xor_b32 s1, s3, s1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_ashr_i32 s1, s1, 30 +; GFX9-NEXT: s_lshr_b32 s9, s2, 15 +; GFX9-NEXT: s_bfe_u32 s10, s2, 0xf000f ; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: v_mad_f32 v3, -v4, v2, v3 ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX9-NEXT: s_lshr_b32 s9, s4, 15 -; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 30 -; GFX9-NEXT: s_bfe_u32 s5, s4, 0xf000f -; GFX9-NEXT: v_alignbit_b32 v1, s7, v1, 30 -; GFX9-NEXT: s_lshr_b32 s7, s6, 15 -; GFX9-NEXT: s_bfe_u32 s10, s6, 0xf000f -; GFX9-NEXT: s_or_b32 s11, s0, 1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v2| -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s11, 0 -; GFX9-NEXT: v_add_u32_e32 v2, s0, v4 +; GFX9-NEXT: s_lshr_b32 s11, s0, 15 +; GFX9-NEXT: s_bfe_u32 s12, s0, 0xf000f +; GFX9-NEXT: s_or_b32 s1, s1, 1 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v3|, |v2| +; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GFX9-NEXT: s_cselect_b32 s1, s1, 0 +; GFX9-NEXT: v_add_u32_e32 v2, s1, v4 +; GFX9-NEXT: s_bfe_i32 s1, s12, 0xf0000 +; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s1 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, s0 ; GFX9-NEXT: s_bfe_i32 s0, s10, 0xf0000 -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 -; GFX9-NEXT: s_bfe_i32 s1, s5, 0xf0000 -; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s1 -; GFX9-NEXT: s_xor_b32 s0, s1, s0 +; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3 +; GFX9-NEXT: s_xor_b32 s0, s0, s1 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: s_or_b32 s5, s0, 1 -; GFX9-NEXT: v_and_b32_e32 v1, s8, v1 +; GFX9-NEXT: s_or_b32 s3, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| +; GFX9-NEXT: v_and_b32_e32 v1, s8, v1 ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s5, 0 +; GFX9-NEXT: s_cselect_b32 s0, s3, 0 ; GFX9-NEXT: v_bfe_i32 v4, v1, 0, 15 ; GFX9-NEXT: v_add_u32_e32 v3, s0, v5 ; GFX9-NEXT: v_cvt_f32_i32_e32 v5, v4 @@ -6899,13 +6899,12 @@ ; GFX9-NEXT: v_mad_f32 v6, -v6, v5, v7 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, |v5| ; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v3, s7 +; GFX9-NEXT: v_mul_lo_u32 v3, v3, s11 ; GFX9-NEXT: v_add_u32_e32 v4, v8, v4 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, s6 ; GFX9-NEXT: v_mul_lo_u32 v1, v4, v1 +; GFX9-NEXT: v_sub_u32_e32 v2, s2, v2 ; GFX9-NEXT: v_sub_u32_e32 v3, s9, v3 ; GFX9-NEXT: v_and_b32_e32 v3, s8, v3 -; GFX9-NEXT: v_sub_u32_e32 v2, s4, v2 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] ; GFX9-NEXT: v_and_b32_e32 v2, s8, v2 @@ -6913,63 +6912,63 @@ ; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX9-NEXT: global_store_dword v4, v0, s[2:3] +; GFX9-NEXT: global_store_dword v4, v0, s[4:5] ; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX9-NEXT: global_store_short v4, v0, s[2:3] offset:4 +; GFX9-NEXT: global_store_short v4, v0, s[4:5] offset:4 ; GFX9-NEXT: s_endpgm ; ; GFX90A-LABEL: srem_v3i15: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_and_b32 s0, s4, s8 -; GFX90A-NEXT: s_and_b32 s1, s6, s8 +; GFX90A-NEXT: s_and_b32 s6, s2, s8 +; GFX90A-NEXT: s_bfe_i32 s6, s6, 0xf0000 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v4, s6 +; GFX90A-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NEXT: v_alignbit_b32 v1, s1, v1, 30 +; GFX90A-NEXT: s_and_b32 s1, s0, s8 ; GFX90A-NEXT: s_bfe_i32 s1, s1, 0xf0000 ; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, s1 -; GFX90A-NEXT: s_bfe_i32 s0, s0, 0xf0000 -; GFX90A-NEXT: v_cvt_f32_i32_e32 v4, s0 -; GFX90A-NEXT: s_xor_b32 s0, s0, s1 +; GFX90A-NEXT: s_xor_b32 s1, s6, s1 +; GFX90A-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NEXT: s_ashr_i32 s1, s1, 30 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v0, s4 -; GFX90A-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 +; GFX90A-NEXT: v_alignbit_b32 v0, s3, v0, 30 +; GFX90A-NEXT: s_lshr_b32 s3, s2, 15 +; GFX90A-NEXT: s_bfe_u32 s9, s2, 0xf000f ; GFX90A-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 ; GFX90A-NEXT: v_mad_f32 v4, -v5, v3, v4 ; GFX90A-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX90A-NEXT: v_alignbit_b32 v0, s5, v0, 30 -; GFX90A-NEXT: s_lshr_b32 s5, s4, 15 -; GFX90A-NEXT: s_bfe_u32 s9, s4, 0xf000f -; GFX90A-NEXT: v_alignbit_b32 v1, s7, v1, 30 -; GFX90A-NEXT: s_lshr_b32 s7, s6, 15 -; GFX90A-NEXT: s_bfe_u32 s10, s6, 0xf000f -; GFX90A-NEXT: s_or_b32 s11, s0, 1 -; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| -; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX90A-NEXT: s_cselect_b32 s0, s11, 0 -; GFX90A-NEXT: v_add_u32_e32 v3, s0, v5 -; GFX90A-NEXT: s_bfe_i32 s0, s10, 0xf0000 +; GFX90A-NEXT: s_lshr_b32 s10, s0, 15 +; GFX90A-NEXT: s_bfe_u32 s11, s0, 0xf000f +; GFX90A-NEXT: s_or_b32 s1, s1, 1 +; GFX90A-NEXT: v_cmp_ge_f32_e64 s[6:7], |v4|, |v3| +; GFX90A-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GFX90A-NEXT: s_cselect_b32 s1, s1, 0 +; GFX90A-NEXT: v_add_u32_e32 v3, s1, v5 +; GFX90A-NEXT: v_mul_lo_u32 v3, v3, s0 +; GFX90A-NEXT: s_bfe_i32 s0, s11, 0xf0000 ; GFX90A-NEXT: v_cvt_f32_i32_e32 v4, s0 ; GFX90A-NEXT: s_bfe_i32 s1, s9, 0xf0000 ; GFX90A-NEXT: v_cvt_f32_i32_e32 v5, s1 ; GFX90A-NEXT: s_xor_b32 s0, s1, s0 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v6, v4 -; GFX90A-NEXT: v_mul_lo_u32 v3, v3, s6 ; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 -; GFX90A-NEXT: v_sub_u32_e32 v3, s4, v3 +; GFX90A-NEXT: v_sub_u32_e32 v3, s2, v3 +; GFX90A-NEXT: s_or_b32 s2, s0, 1 ; GFX90A-NEXT: v_mul_f32_e32 v6, v5, v6 ; GFX90A-NEXT: v_trunc_f32_e32 v6, v6 ; GFX90A-NEXT: v_mad_f32 v5, -v6, v4, v5 ; GFX90A-NEXT: v_cvt_i32_f32_e32 v6, v6 -; GFX90A-NEXT: s_or_b32 s4, s0, 1 ; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v4| ; GFX90A-NEXT: v_and_b32_e32 v1, s8, v1 ; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 +; GFX90A-NEXT: s_cselect_b32 s0, s2, 0 ; GFX90A-NEXT: v_bfe_i32 v5, v1, 0, 15 ; GFX90A-NEXT: v_add_u32_e32 v4, s0, v6 ; GFX90A-NEXT: v_cvt_f32_i32_e32 v6, v5 @@ -6985,9 +6984,9 @@ ; GFX90A-NEXT: v_cvt_i32_f32_e32 v9, v7 ; GFX90A-NEXT: v_mad_f32 v7, -v7, v6, v8 ; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v6| -; GFX90A-NEXT: v_mul_lo_u32 v4, v4, s7 +; GFX90A-NEXT: v_mul_lo_u32 v4, v4, s10 ; GFX90A-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc -; GFX90A-NEXT: v_sub_u32_e32 v4, s5, v4 +; GFX90A-NEXT: v_sub_u32_e32 v4, s3, v4 ; GFX90A-NEXT: v_add_u32_e32 v5, v9, v5 ; GFX90A-NEXT: v_mul_lo_u32 v1, v5, v1 ; GFX90A-NEXT: v_and_b32_e32 v4, s8, v4 @@ -6997,9 +6996,9 @@ ; GFX90A-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] ; GFX90A-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX90A-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX90A-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NEXT: global_store_dword v2, v0, s[4:5] ; GFX90A-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX90A-NEXT: global_store_short v2, v0, s[2:3] offset:4 +; GFX90A-NEXT: global_store_short v2, v0, s[4:5] offset:4 ; GFX90A-NEXT: s_endpgm %r = srem <3 x i15> %x, %y store <3 x i15> %r, <3 x i15> addrspace(1)* %out @@ -7014,24 +7013,24 @@ ; ; GFX6-LABEL: udiv_i32_oddk_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0xb2a50881 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s0, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s4, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 20, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_i32_oddk_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mul_hi_u32 s0, s4, 0xb2a50881 @@ -7045,8 +7044,8 @@ ; ; GFX90A-LABEL: udiv_i32_oddk_denom: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_mul_hi_u32 s0, s4, 0xb2a50881 @@ -7070,20 +7069,20 @@ ; ; GFX6-LABEL: udiv_i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshr_b32 s0, s0, 12 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_lshr_b32 s4, s4, 12 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s0, s4, 12 @@ -7093,8 +7092,8 @@ ; ; GFX90A-LABEL: udiv_i32_pow2k_denom: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_lshr_b32 s0, s4, 12 @@ -7115,39 +7114,39 @@ ; ; GFX6-LABEL: udiv_i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_add_i32 s1, s1, 12 -; GFX6-NEXT: s_lshr_b32 s0, s0, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_add_i32 s5, s5, 12 +; GFX6-NEXT: s_lshr_b32 s4, s4, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_i32 s0, s5, 12 -; GFX9-NEXT: s_lshr_b32 s0, s4, s0 +; GFX9-NEXT: s_add_i32 s0, s3, 12 +; GFX9-NEXT: s_lshr_b32 s0, s2, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX90A-LABEL: udiv_i32_pow2_shl_denom: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_add_i32 s0, s5, 12 -; GFX90A-NEXT: s_lshr_b32 s0, s4, s0 +; GFX90A-NEXT: s_add_i32 s0, s3, 12 +; GFX90A-NEXT: s_lshr_b32 s0, s2, s0 ; GFX90A-NEXT: v_mov_b32_e32 v1, s0 -; GFX90A-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NEXT: s_endpgm %shl.y = shl i32 4096, %y %r = udiv i32 %x, %shl.y @@ -7168,42 +7167,42 @@ ; ; GFX6-LABEL: udiv_v2i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshr_b32 s0, s0, 12 -; GFX6-NEXT: s_lshr_b32 s1, s1, 12 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: s_lshr_b32 s4, s4, 12 +; GFX6-NEXT: s_lshr_b32 s5, s5, 12 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_v2i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s0, s4, 12 -; GFX9-NEXT: s_lshr_b32 s1, s5, 12 +; GFX9-NEXT: s_lshr_b32 s0, s2, 12 +; GFX9-NEXT: s_lshr_b32 s1, s3, 12 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX90A-LABEL: udiv_v2i32_pow2k_denom: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_lshr_b32 s0, s4, 12 -; GFX90A-NEXT: s_lshr_b32 s1, s5, 12 +; GFX90A-NEXT: s_lshr_b32 s0, s2, 12 +; GFX90A-NEXT: s_lshr_b32 s1, s3, 12 ; GFX90A-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX90A-NEXT: s_endpgm %r = udiv <2 x i32> %x, store <2 x i32> %r, <2 x i32> addrspace(1)* %out @@ -7223,54 +7222,54 @@ ; ; GFX6-LABEL: udiv_v2i32_mixed_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x100101 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 -; GFX6-NEXT: s_lshr_b32 s0, s0, 12 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s1, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 +; GFX6-NEXT: s_lshr_b32 s4, s4, 12 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 11, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_v2i32_mixed_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_hi_u32 s1, s5, 0x100101 -; GFX9-NEXT: s_lshr_b32 s0, s4, 12 -; GFX9-NEXT: s_sub_i32 s4, s5, s1 -; GFX9-NEXT: s_lshr_b32 s4, s4, 1 -; GFX9-NEXT: s_add_i32 s4, s4, s1 -; GFX9-NEXT: s_lshr_b32 s1, s4, 11 +; GFX9-NEXT: s_mul_hi_u32 s1, s3, 0x100101 +; GFX9-NEXT: s_lshr_b32 s0, s2, 12 +; GFX9-NEXT: s_sub_i32 s2, s3, s1 +; GFX9-NEXT: s_lshr_b32 s2, s2, 1 +; GFX9-NEXT: s_add_i32 s2, s2, s1 +; GFX9-NEXT: s_lshr_b32 s1, s2, 11 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX90A-LABEL: udiv_v2i32_mixed_pow2k_denom: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_mul_hi_u32 s1, s5, 0x100101 -; GFX90A-NEXT: s_lshr_b32 s0, s4, 12 -; GFX90A-NEXT: s_sub_i32 s4, s5, s1 -; GFX90A-NEXT: s_lshr_b32 s4, s4, 1 -; GFX90A-NEXT: s_add_i32 s4, s4, s1 -; GFX90A-NEXT: s_lshr_b32 s1, s4, 11 +; GFX90A-NEXT: s_mul_hi_u32 s1, s3, 0x100101 +; GFX90A-NEXT: s_lshr_b32 s0, s2, 12 +; GFX90A-NEXT: s_sub_i32 s2, s3, s1 +; GFX90A-NEXT: s_lshr_b32 s2, s2, 1 +; GFX90A-NEXT: s_add_i32 s2, s2, s1 +; GFX90A-NEXT: s_lshr_b32 s1, s2, 11 ; GFX90A-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX90A-NEXT: s_endpgm %r = udiv <2 x i32> %x, store <2 x i32> %r, <2 x i32> addrspace(1)* %out @@ -7421,11 +7420,11 @@ ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_lo_u32 v2, s2, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s3, v1 -; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 @@ -7538,8 +7537,8 @@ ; ; GFX9-LABEL: urem_i32_oddk_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mul_hi_u32 s0, s4, 0xb2a50881 @@ -7555,8 +7554,8 @@ ; ; GFX90A-LABEL: urem_i32_oddk_denom: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_mul_hi_u32 s0, s4, 0xb2a50881 @@ -7582,20 +7581,20 @@ ; ; GFX6-LABEL: urem_i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s0, s0, 0xfff -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_and_b32 s4, s4, 0xfff +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s0, s4, 0xfff @@ -7605,8 +7604,8 @@ ; ; GFX90A-LABEL: urem_i32_pow2k_denom: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_and_b32 s0, s4, 0xfff @@ -7627,42 +7626,42 @@ ; ; GFX6-LABEL: urem_i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s1, 0x1000, s1 -; GFX6-NEXT: s_add_i32 s1, s1, -1 -; GFX6-NEXT: s_and_b32 s0, s0, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_lshl_b32 s5, 0x1000, s5 +; GFX6-NEXT: s_add_i32 s5, s5, -1 +; GFX6-NEXT: s_and_b32 s4, s4, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s5 +; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s3 ; GFX9-NEXT: s_add_i32 s0, s0, -1 -; GFX9-NEXT: s_and_b32 s0, s4, s0 +; GFX9-NEXT: s_and_b32 s0, s2, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX90A-LABEL: urem_i32_pow2_shl_denom: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_lshl_b32 s0, 0x1000, s5 +; GFX90A-NEXT: s_lshl_b32 s0, 0x1000, s3 ; GFX90A-NEXT: s_add_i32 s0, s0, -1 -; GFX90A-NEXT: s_and_b32 s0, s4, s0 +; GFX90A-NEXT: s_and_b32 s0, s2, s0 ; GFX90A-NEXT: v_mov_b32_e32 v1, s0 -; GFX90A-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NEXT: s_endpgm %shl.y = shl i32 4096, %y %r = urem i32 %x, %shl.y @@ -7683,45 +7682,45 @@ ; ; GFX6-LABEL: urem_v2i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_movk_i32 s2, 0xfff -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_movk_i32 s6, 0xfff +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s0, s0, s2 -; GFX6-NEXT: s_and_b32 s1, s1, s2 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: s_and_b32 s4, s4, s6 +; GFX6-NEXT: s_and_b32 s5, s5, s6 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_v2i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_movk_i32 s0, 0xfff ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s4, s0 -; GFX9-NEXT: s_and_b32 s0, s5, s0 +; GFX9-NEXT: s_and_b32 s1, s2, s0 +; GFX9-NEXT: s_and_b32 s0, s3, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX90A-LABEL: urem_v2i32_pow2k_denom: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX90A-NEXT: s_movk_i32 s0, 0xfff ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_and_b32 s1, s4, s0 -; GFX90A-NEXT: s_and_b32 s0, s5, s0 +; GFX90A-NEXT: s_and_b32 s1, s2, s0 +; GFX90A-NEXT: s_and_b32 s0, s3, s0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s1 ; GFX90A-NEXT: v_mov_b32_e32 v1, s0 -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX90A-NEXT: s_endpgm %r = urem <2 x i32> %x, store <2 x i32> %r, <2 x i32> addrspace(1)* %out @@ -7798,50 +7797,50 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd ; GFX6-NEXT: s_movk_i32 s4, 0x1000 -; GFX6-NEXT: s_mov_b32 s5, 0x4f7ffffe -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s2, s4, s2 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX6-NEXT: s_lshl_b32 s3, s4, s3 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX6-NEXT: s_sub_i32 s4, 0, s2 +; GFX6-NEXT: s_lshl_b32 s6, s4, s2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX6-NEXT: s_lshl_b32 s7, s4, s3 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GFX6-NEXT: s_mov_b32 s2, 0x4f7ffffe ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, s5, v0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: v_mul_f32_e32 v0, s2, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, s5, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, s2, v1 +; GFX6-NEXT: s_sub_i32 s2, 0, s6 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_lo_u32 v2, s4, v0 -; GFX6-NEXT: s_sub_i32 s4, 0, s3 -; GFX6-NEXT: v_mul_lo_u32 v3, s4, v1 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: v_mul_lo_u32 v2, s2, v0 +; GFX6-NEXT: s_sub_i32 s2, 0, s7 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: v_mul_lo_u32 v3, s2, v1 ; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; GFX6-NEXT: v_mul_hi_u32 v1, s1, v1 -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s3 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s2, v0 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 +; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s6, v0 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s2, v0 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s6, v0 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s7, v1 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s7, v1 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_v2i32_pow2_shl_denom: @@ -7954,24 +7953,24 @@ ; ; GFX6-LABEL: sdiv_i32_oddk_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0xd9528441 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mul_hi_i32 v0, s0, v0 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GFX6-NEXT: v_mul_hi_i32 v0, s4, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 31, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 20, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_i32_oddk_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mul_hi_i32 s0, s4, 0xd9528441 @@ -7985,8 +7984,8 @@ ; ; GFX90A-LABEL: sdiv_i32_oddk_denom: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_mul_hi_i32 s0, s4, 0xd9528441 @@ -8010,23 +8009,23 @@ ; ; GFX6-LABEL: sdiv_i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_ashr_i32 s1, s0, 31 -; GFX6-NEXT: s_lshr_b32 s1, s1, 20 -; GFX6-NEXT: s_add_i32 s0, s0, s1 -; GFX6-NEXT: s_ashr_i32 s0, s0, 12 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_ashr_i32 s5, s4, 31 +; GFX6-NEXT: s_lshr_b32 s5, s5, 20 +; GFX6-NEXT: s_add_i32 s4, s4, s5 +; GFX6-NEXT: s_ashr_i32 s4, s4, 12 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s0, s4, 31 @@ -8039,8 +8038,8 @@ ; ; GFX90A-LABEL: sdiv_i32_pow2k_denom: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_ashr_i32 s0, s4, 31 @@ -8196,60 +8195,60 @@ ; ; GFX6-LABEL: sdiv_v2i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_ashr_i32 s2, s0, 31 -; GFX6-NEXT: s_ashr_i32 s3, s1, 31 -; GFX6-NEXT: s_lshr_b32 s2, s2, 20 -; GFX6-NEXT: s_add_i32 s0, s0, s2 -; GFX6-NEXT: s_lshr_b32 s2, s3, 20 -; GFX6-NEXT: s_add_i32 s1, s1, s2 -; GFX6-NEXT: s_ashr_i32 s0, s0, 12 -; GFX6-NEXT: s_ashr_i32 s1, s1, 12 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: s_ashr_i32 s6, s4, 31 +; GFX6-NEXT: s_ashr_i32 s7, s5, 31 +; GFX6-NEXT: s_lshr_b32 s6, s6, 20 +; GFX6-NEXT: s_add_i32 s4, s4, s6 +; GFX6-NEXT: s_lshr_b32 s6, s7, 20 +; GFX6-NEXT: s_add_i32 s5, s5, s6 +; GFX6-NEXT: s_ashr_i32 s4, s4, 12 +; GFX6-NEXT: s_ashr_i32 s5, s5, 12 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v2i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s0, s4, 31 -; GFX9-NEXT: s_ashr_i32 s1, s5, 31 +; GFX9-NEXT: s_ashr_i32 s0, s2, 31 +; GFX9-NEXT: s_ashr_i32 s1, s3, 31 ; GFX9-NEXT: s_lshr_b32 s0, s0, 20 ; GFX9-NEXT: s_lshr_b32 s1, s1, 20 -; GFX9-NEXT: s_add_i32 s0, s4, s0 -; GFX9-NEXT: s_add_i32 s1, s5, s1 +; GFX9-NEXT: s_add_i32 s0, s2, s0 +; GFX9-NEXT: s_add_i32 s1, s3, s1 ; GFX9-NEXT: s_ashr_i32 s0, s0, 12 ; GFX9-NEXT: s_ashr_i32 s1, s1, 12 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX90A-LABEL: sdiv_v2i32_pow2k_denom: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_ashr_i32 s0, s4, 31 -; GFX90A-NEXT: s_ashr_i32 s1, s5, 31 +; GFX90A-NEXT: s_ashr_i32 s0, s2, 31 +; GFX90A-NEXT: s_ashr_i32 s1, s3, 31 ; GFX90A-NEXT: s_lshr_b32 s0, s0, 20 ; GFX90A-NEXT: s_lshr_b32 s1, s1, 20 -; GFX90A-NEXT: s_add_i32 s0, s4, s0 -; GFX90A-NEXT: s_add_i32 s1, s5, s1 +; GFX90A-NEXT: s_add_i32 s0, s2, s0 +; GFX90A-NEXT: s_add_i32 s1, s3, s1 ; GFX90A-NEXT: s_ashr_i32 s0, s0, 12 ; GFX90A-NEXT: s_ashr_i32 s1, s1, 12 ; GFX90A-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX90A-NEXT: s_endpgm %r = sdiv <2 x i32> %x, store <2 x i32> %r, <2 x i32> addrspace(1)* %out @@ -8269,63 +8268,63 @@ ; ; GFX6-LABEL: ssdiv_v2i32_mixed_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x80080081 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mul_hi_i32 v0, s1, v0 -; GFX6-NEXT: s_ashr_i32 s2, s0, 31 -; GFX6-NEXT: s_lshr_b32 s2, s2, 20 -; GFX6-NEXT: s_add_i32 s0, s0, s2 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, s1, v0 -; GFX6-NEXT: s_ashr_i32 s0, s0, 12 +; GFX6-NEXT: v_mul_hi_i32 v0, s5, v0 +; GFX6-NEXT: s_ashr_i32 s6, s4, 31 +; GFX6-NEXT: s_lshr_b32 s6, s6, 20 +; GFX6-NEXT: s_add_i32 s4, s4, s6 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s5, v0 +; GFX6-NEXT: s_ashr_i32 s4, s4, 12 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 31, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 11, v0 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: ssdiv_v2i32_mixed_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s0, s4, 31 -; GFX9-NEXT: s_mul_hi_i32 s1, s5, 0x80080081 +; GFX9-NEXT: s_ashr_i32 s0, s2, 31 +; GFX9-NEXT: s_mul_hi_i32 s1, s3, 0x80080081 ; GFX9-NEXT: s_lshr_b32 s0, s0, 20 -; GFX9-NEXT: s_add_i32 s1, s1, s5 -; GFX9-NEXT: s_add_i32 s0, s4, s0 -; GFX9-NEXT: s_lshr_b32 s4, s1, 31 +; GFX9-NEXT: s_add_i32 s1, s1, s3 +; GFX9-NEXT: s_add_i32 s0, s2, s0 +; GFX9-NEXT: s_lshr_b32 s2, s1, 31 ; GFX9-NEXT: s_ashr_i32 s1, s1, 11 ; GFX9-NEXT: s_ashr_i32 s0, s0, 12 -; GFX9-NEXT: s_add_i32 s1, s1, s4 +; GFX9-NEXT: s_add_i32 s1, s1, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX90A-LABEL: ssdiv_v2i32_mixed_pow2k_denom: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_ashr_i32 s0, s4, 31 -; GFX90A-NEXT: s_mul_hi_i32 s1, s5, 0x80080081 +; GFX90A-NEXT: s_ashr_i32 s0, s2, 31 +; GFX90A-NEXT: s_mul_hi_i32 s1, s3, 0x80080081 ; GFX90A-NEXT: s_lshr_b32 s0, s0, 20 -; GFX90A-NEXT: s_add_i32 s1, s1, s5 -; GFX90A-NEXT: s_add_i32 s0, s4, s0 -; GFX90A-NEXT: s_lshr_b32 s4, s1, 31 +; GFX90A-NEXT: s_add_i32 s1, s1, s3 +; GFX90A-NEXT: s_add_i32 s0, s2, s0 +; GFX90A-NEXT: s_lshr_b32 s2, s1, 31 ; GFX90A-NEXT: s_ashr_i32 s1, s1, 11 ; GFX90A-NEXT: s_ashr_i32 s0, s0, 12 -; GFX90A-NEXT: s_add_i32 s1, s1, s4 +; GFX90A-NEXT: s_add_i32 s1, s1, s2 ; GFX90A-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX90A-NEXT: s_endpgm %r = sdiv <2 x i32> %x, store <2 x i32> %r, <2 x i32> addrspace(1)* %out @@ -8663,8 +8662,8 @@ ; ; GFX9-LABEL: srem_i32_oddk_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mul_hi_i32 s0, s4, 0xd9528441 @@ -8680,8 +8679,8 @@ ; ; GFX90A-LABEL: srem_i32_oddk_denom: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_mul_hi_i32 s0, s4, 0xd9528441 @@ -8707,24 +8706,24 @@ ; ; GFX6-LABEL: srem_i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_ashr_i32 s1, s0, 31 -; GFX6-NEXT: s_lshr_b32 s1, s1, 20 -; GFX6-NEXT: s_add_i32 s1, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s1, 0xfffff000 -; GFX6-NEXT: s_sub_i32 s0, s0, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_ashr_i32 s5, s4, 31 +; GFX6-NEXT: s_lshr_b32 s5, s5, 20 +; GFX6-NEXT: s_add_i32 s5, s4, s5 +; GFX6-NEXT: s_and_b32 s5, s5, 0xfffff000 +; GFX6-NEXT: s_sub_i32 s4, s4, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s0, s4, 31 @@ -8738,8 +8737,8 @@ ; ; GFX90A-LABEL: srem_i32_pow2k_denom: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_ashr_i32 s0, s4, 31 @@ -8887,69 +8886,69 @@ ; ; GFX6-LABEL: srem_v2i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_movk_i32 s2, 0xf000 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_movk_i32 s6, 0xf000 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_ashr_i32 s3, s0, 31 -; GFX6-NEXT: s_lshr_b32 s3, s3, 20 -; GFX6-NEXT: s_add_i32 s3, s0, s3 -; GFX6-NEXT: s_and_b32 s3, s3, s2 -; GFX6-NEXT: s_sub_i32 s0, s0, s3 -; GFX6-NEXT: s_ashr_i32 s3, s1, 31 -; GFX6-NEXT: s_lshr_b32 s3, s3, 20 -; GFX6-NEXT: s_add_i32 s3, s1, s3 -; GFX6-NEXT: s_and_b32 s2, s3, s2 -; GFX6-NEXT: s_sub_i32 s1, s1, s2 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: s_ashr_i32 s7, s4, 31 +; GFX6-NEXT: s_lshr_b32 s7, s7, 20 +; GFX6-NEXT: s_add_i32 s7, s4, s7 +; GFX6-NEXT: s_and_b32 s7, s7, s6 +; GFX6-NEXT: s_sub_i32 s4, s4, s7 +; GFX6-NEXT: s_ashr_i32 s7, s5, 31 +; GFX6-NEXT: s_lshr_b32 s7, s7, 20 +; GFX6-NEXT: s_add_i32 s7, s5, s7 +; GFX6-NEXT: s_and_b32 s6, s7, s6 +; GFX6-NEXT: s_sub_i32 s5, s5, s6 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_v2i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX9-NEXT: s_movk_i32 s6, 0xf000 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_movk_i32 s0, 0xf000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s0, s4, 31 -; GFX9-NEXT: s_ashr_i32 s1, s5, 31 -; GFX9-NEXT: s_lshr_b32 s0, s0, 20 +; GFX9-NEXT: s_ashr_i32 s1, s2, 31 ; GFX9-NEXT: s_lshr_b32 s1, s1, 20 -; GFX9-NEXT: s_add_i32 s0, s4, s0 -; GFX9-NEXT: s_add_i32 s1, s5, s1 -; GFX9-NEXT: s_and_b32 s0, s0, s6 -; GFX9-NEXT: s_and_b32 s1, s1, s6 -; GFX9-NEXT: s_sub_i32 s0, s4, s0 -; GFX9-NEXT: s_sub_i32 s1, s5, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: s_add_i32 s1, s2, s1 +; GFX9-NEXT: s_ashr_i32 s6, s3, 31 +; GFX9-NEXT: s_and_b32 s1, s1, s0 +; GFX9-NEXT: s_sub_i32 s1, s2, s1 +; GFX9-NEXT: s_lshr_b32 s2, s6, 20 +; GFX9-NEXT: s_add_i32 s2, s3, s2 +; GFX9-NEXT: s_and_b32 s0, s2, s0 +; GFX9-NEXT: s_sub_i32 s0, s3, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX90A-LABEL: srem_v2i32_pow2k_denom: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX90A-NEXT: s_movk_i32 s6, 0xf000 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX90A-NEXT: s_movk_i32 s0, 0xf000 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_ashr_i32 s0, s4, 31 -; GFX90A-NEXT: s_ashr_i32 s1, s5, 31 -; GFX90A-NEXT: s_lshr_b32 s0, s0, 20 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_ashr_i32 s1, s2, 31 ; GFX90A-NEXT: s_lshr_b32 s1, s1, 20 -; GFX90A-NEXT: s_add_i32 s0, s4, s0 -; GFX90A-NEXT: s_add_i32 s1, s5, s1 -; GFX90A-NEXT: s_and_b32 s0, s0, s6 -; GFX90A-NEXT: s_and_b32 s1, s1, s6 -; GFX90A-NEXT: s_sub_i32 s0, s4, s0 -; GFX90A-NEXT: s_sub_i32 s1, s5, s1 -; GFX90A-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX90A-NEXT: s_add_i32 s1, s2, s1 +; GFX90A-NEXT: s_ashr_i32 s6, s3, 31 +; GFX90A-NEXT: s_and_b32 s1, s1, s0 +; GFX90A-NEXT: s_sub_i32 s1, s2, s1 +; GFX90A-NEXT: s_lshr_b32 s2, s6, 20 +; GFX90A-NEXT: s_add_i32 s2, s3, s2 +; GFX90A-NEXT: s_and_b32 s0, s2, s0 +; GFX90A-NEXT: s_sub_i32 s0, s3, s0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX90A-NEXT: s_endpgm %r = srem <2 x i32> %x, store <2 x i32> %r, <2 x i32> addrspace(1)* %out @@ -9106,130 +9105,130 @@ ; ; GFX9-LABEL: srem_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 -; GFX9-NEXT: s_movk_i32 s8, 0x1000 -; GFX9-NEXT: s_mov_b32 s9, 0x4f7ffffe -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c +; GFX9-NEXT: s_movk_i32 s0, 0x1000 +; GFX9-NEXT: s_mov_b32 s8, 0x4f7ffffe ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s0, s8, s6 -; GFX9-NEXT: s_ashr_i32 s1, s0, 31 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_xor_b32 s0, s0, s1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX9-NEXT: s_lshl_b32 s1, s8, s7 -; GFX9-NEXT: s_ashr_i32 s6, s1, 31 -; GFX9-NEXT: s_add_i32 s1, s1, s6 -; GFX9-NEXT: s_xor_b32 s1, s1, s6 +; GFX9-NEXT: s_lshl_b32 s1, s0, s2 +; GFX9-NEXT: s_ashr_i32 s2, s1, 31 +; GFX9-NEXT: s_add_i32 s1, s1, s2 +; GFX9-NEXT: s_xor_b32 s1, s1, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 +; GFX9-NEXT: s_lshl_b32 s0, s0, s3 +; GFX9-NEXT: s_ashr_i32 s2, s0, 31 +; GFX9-NEXT: s_add_i32 s0, s0, s2 +; GFX9-NEXT: s_xor_b32 s0, s0, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s1 -; GFX9-NEXT: s_sub_i32 s7, 0, s0 -; GFX9-NEXT: s_ashr_i32 s6, s4, 31 -; GFX9-NEXT: v_mul_f32_e32 v0, s9, v0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s0 +; GFX9-NEXT: s_sub_i32 s3, 0, s1 +; GFX9-NEXT: s_ashr_i32 s2, s6, 31 +; GFX9-NEXT: v_mul_f32_e32 v0, s8, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_add_i32 s4, s4, s6 -; GFX9-NEXT: s_xor_b32 s4, s4, s6 -; GFX9-NEXT: v_mul_f32_e32 v1, s9, v1 -; GFX9-NEXT: v_mul_lo_u32 v3, s7, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mul_f32_e32 v1, s8, v1 +; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: s_sub_i32 s7, 0, s1 +; GFX9-NEXT: s_add_i32 s3, s6, s2 +; GFX9-NEXT: s_sub_i32 s6, 0, s0 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, v3 -; GFX9-NEXT: v_mul_lo_u32 v4, s7, v1 -; GFX9-NEXT: s_ashr_i32 s7, s5, 31 -; GFX9-NEXT: s_add_i32 s5, s5, s7 +; GFX9-NEXT: v_mul_lo_u32 v4, s6, v1 +; GFX9-NEXT: s_xor_b32 s3, s3, s2 +; GFX9-NEXT: s_ashr_i32 s6, s7, 31 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v3 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v4 -; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX9-NEXT: s_xor_b32 s5, s5, s7 +; GFX9-NEXT: v_mul_hi_u32 v0, s3, v0 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_xor_b32 s7, s7, s6 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s0 -; GFX9-NEXT: v_mul_lo_u32 v1, v1, s1 -; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 -; GFX9-NEXT: v_subrev_u32_e32 v3, s0, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v0 +; GFX9-NEXT: v_mul_hi_u32 v1, s7, v1 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s1 +; GFX9-NEXT: v_mul_lo_u32 v1, v1, s0 +; GFX9-NEXT: v_sub_u32_e32 v0, s3, v0 +; GFX9-NEXT: v_subrev_u32_e32 v3, s1, v0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s0, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s1, v0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v0 +; GFX9-NEXT: v_sub_u32_e32 v1, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s0, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s0, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_xor_b32_e32 v0, s6, v0 -; GFX9-NEXT: v_xor_b32_e32 v1, s7, v1 -; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v0 -; GFX9-NEXT: v_subrev_u32_e32 v1, s7, v1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 +; GFX9-NEXT: v_xor_b32_e32 v1, s6, v1 +; GFX9-NEXT: v_subrev_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_subrev_u32_e32 v1, s6, v1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX90A-LABEL: srem_v2i32_pow2_shl_denom: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 -; GFX90A-NEXT: s_movk_i32 s8, 0x1000 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c +; GFX90A-NEXT: s_movk_i32 s0, 0x1000 ; GFX90A-NEXT: s_mov_b32 s9, 0x4f7ffffe -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_lshl_b32 s0, s8, s6 -; GFX90A-NEXT: s_ashr_i32 s1, s0, 31 -; GFX90A-NEXT: s_add_i32 s0, s0, s1 -; GFX90A-NEXT: s_xor_b32 s0, s0, s1 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX90A-NEXT: s_lshl_b32 s1, s8, s7 -; GFX90A-NEXT: s_sub_i32 s8, 0, s0 -; GFX90A-NEXT: s_ashr_i32 s6, s4, 31 +; GFX90A-NEXT: s_lshl_b32 s1, s0, s2 +; GFX90A-NEXT: s_ashr_i32 s2, s1, 31 +; GFX90A-NEXT: s_add_i32 s1, s1, s2 +; GFX90A-NEXT: s_xor_b32 s1, s1, s2 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s1 +; GFX90A-NEXT: s_sub_i32 s8, 0, s1 +; GFX90A-NEXT: s_ashr_i32 s2, s6, 31 +; GFX90A-NEXT: s_lshl_b32 s0, s0, s3 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX90A-NEXT: s_add_i32 s4, s4, s6 -; GFX90A-NEXT: s_xor_b32 s4, s4, s6 -; GFX90A-NEXT: s_ashr_i32 s7, s1, 31 +; GFX90A-NEXT: s_add_i32 s3, s6, s2 +; GFX90A-NEXT: s_xor_b32 s3, s3, s2 +; GFX90A-NEXT: s_ashr_i32 s6, s0, 31 ; GFX90A-NEXT: v_mul_f32_e32 v0, s9, v0 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX90A-NEXT: s_add_i32 s1, s1, s7 -; GFX90A-NEXT: s_xor_b32 s1, s1, s7 +; GFX90A-NEXT: s_add_i32 s0, s0, s6 +; GFX90A-NEXT: s_xor_b32 s0, s0, s6 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_mul_lo_u32 v1, s8, v0 ; GFX90A-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX90A-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX90A-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s0 -; GFX90A-NEXT: v_sub_u32_e32 v0, s4, v0 -; GFX90A-NEXT: v_subrev_u32_e32 v1, s0, v0 -; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s0, v0 +; GFX90A-NEXT: v_mul_hi_u32 v0, s3, v0 +; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s1 +; GFX90A-NEXT: v_sub_u32_e32 v0, s3, v0 +; GFX90A-NEXT: v_subrev_u32_e32 v1, s1, v0 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s1, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s1 -; GFX90A-NEXT: v_subrev_u32_e32 v3, s0, v0 -; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s0, v0 -; GFX90A-NEXT: s_ashr_i32 s0, s5, 31 -; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX90A-NEXT: s_add_i32 s4, s5, s0 -; GFX90A-NEXT: s_sub_i32 s5, 0, s1 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s0 +; GFX90A-NEXT: v_subrev_u32_e32 v3, s1, v0 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s1, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX90A-NEXT: s_sub_i32 s3, 0, s0 +; GFX90A-NEXT: v_xor_b32_e32 v0, s2, v0 +; GFX90A-NEXT: s_ashr_i32 s1, s7, 31 ; GFX90A-NEXT: v_mul_f32_e32 v1, s9, v1 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX90A-NEXT: s_xor_b32 s4, s4, s0 -; GFX90A-NEXT: v_xor_b32_e32 v0, s6, v0 -; GFX90A-NEXT: v_subrev_u32_e32 v0, s6, v0 -; GFX90A-NEXT: v_mul_lo_u32 v3, s5, v1 +; GFX90A-NEXT: v_subrev_u32_e32 v0, s2, v0 +; GFX90A-NEXT: s_add_i32 s2, s7, s1 +; GFX90A-NEXT: s_xor_b32 s2, s2, s1 +; GFX90A-NEXT: v_mul_lo_u32 v3, s3, v1 ; GFX90A-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX90A-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX90A-NEXT: v_mul_hi_u32 v1, s4, v1 -; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s1 -; GFX90A-NEXT: v_sub_u32_e32 v1, s4, v1 -; GFX90A-NEXT: v_subrev_u32_e32 v3, s1, v1 -; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s1, v1 +; GFX90A-NEXT: v_mul_hi_u32 v1, s2, v1 +; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s0 +; GFX90A-NEXT: v_sub_u32_e32 v1, s2, v1 +; GFX90A-NEXT: v_subrev_u32_e32 v3, s0, v1 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s0, v1 ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX90A-NEXT: v_subrev_u32_e32 v3, s1, v1 -; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s1, v1 +; GFX90A-NEXT: v_subrev_u32_e32 v3, s0, v1 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s0, v1 ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX90A-NEXT: v_xor_b32_e32 v1, s0, v1 -; GFX90A-NEXT: v_subrev_u32_e32 v1, s0, v1 -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX90A-NEXT: v_xor_b32_e32 v1, s1, v1 +; GFX90A-NEXT: v_subrev_u32_e32 v1, s1, v1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX90A-NEXT: s_endpgm %shl.y = shl <2 x i32> , %y %r = srem <2 x i32> %x, %shl.y @@ -9669,8 +9668,8 @@ ; ; GFX9-LABEL: udiv_i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_i32 s2, s2, 12 @@ -9682,8 +9681,8 @@ ; ; GFX90A-LABEL: udiv_i64_pow2_shl_denom: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_add_i32 s2, s2, 12 @@ -9710,24 +9709,24 @@ ; ; GFX6-LABEL: udiv_v2i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], 12 -; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], 12 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: v_mov_b32_e32 v2, s2 -; GFX6-NEXT: v_mov_b32_e32 v3, s3 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], 12 +; GFX6-NEXT: s_lshr_b64 s[6:7], s[6:7], 12 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_v2i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 12 @@ -9741,8 +9740,8 @@ ; ; GFX90A-LABEL: udiv_v2i64_pow2k_denom: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_lshr_b64 s[0:1], s[4:5], 12 @@ -10106,34 +10105,34 @@ ; ; GFX6-LABEL: udiv_v2i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x11 ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_add_i32 s0, s0, 12 -; GFX6-NEXT: s_add_i32 s2, s2, 12 -; GFX6-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 -; GFX6-NEXT: s_lshr_b64 s[2:3], s[10:11], s2 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: v_mov_b32_e32 v2, s2 -; GFX6-NEXT: v_mov_b32_e32 v3, s3 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX6-NEXT: s_add_i32 s4, s4, 12 +; GFX6-NEXT: s_add_i32 s6, s6, 12 +; GFX6-NEXT: s_lshr_b64 s[4:5], s[8:9], s4 +; GFX6-NEXT: s_lshr_b64 s[6:7], s[10:11], s6 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_v2i64_pow2_shl_denom: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_i32 s0, s8, 12 -; GFX9-NEXT: s_add_i32 s8, s10, 12 -; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], s0 -; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], s8 +; GFX9-NEXT: s_add_i32 s0, s4, 12 +; GFX9-NEXT: s_add_i32 s4, s6, 12 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[10:11], s4 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 @@ -10143,15 +10142,15 @@ ; ; GFX90A-LABEL: udiv_v2i64_pow2_shl_denom: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 +; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_add_i32 s0, s8, 12 -; GFX90A-NEXT: s_add_i32 s8, s10, 12 -; GFX90A-NEXT: s_lshr_b64 s[0:1], s[4:5], s0 -; GFX90A-NEXT: s_lshr_b64 s[4:5], s[6:7], s8 +; GFX90A-NEXT: s_add_i32 s0, s4, 12 +; GFX90A-NEXT: s_add_i32 s4, s6, 12 +; GFX90A-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 +; GFX90A-NEXT: s_lshr_b64 s[4:5], s[10:11], s4 ; GFX90A-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-NEXT: v_mov_b32_e32 v2, s4 @@ -10537,16 +10536,16 @@ ; ; GFX6-LABEL: urem_i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: v_mov_b32_e32 v1, 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_and_b32 s4, s6, 0xfff -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_and_b32 s0, s2, 0xfff +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_i64_pow2k_denom: @@ -10601,8 +10600,8 @@ ; ; GFX9-LABEL: urem_i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[0:1], 0x1000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -10617,8 +10616,8 @@ ; ; GFX90A-LABEL: urem_i64_pow2_shl_denom: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_mov_b64 s[0:1], 0x1000 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -10648,25 +10647,25 @@ ; ; GFX6-LABEL: urem_v2i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd -; GFX6-NEXT: s_movk_i32 s8, 0xfff -; GFX6-NEXT: v_mov_b32_e32 v1, 0 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s0, s0, s8 -; GFX6-NEXT: s_and_b32 s1, s2, s8 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v2, s1 +; GFX6-NEXT: s_movk_i32 s5, 0xfff +; GFX6-NEXT: v_mov_b32_e32 v1, 0 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_and_b32 s4, s4, s5 +; GFX6-NEXT: s_and_b32 s5, s6, s5 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_v2i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_movk_i32 s0, 0xfff ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, v1 @@ -10680,8 +10679,8 @@ ; ; GFX90A-LABEL: urem_v2i64_pow2k_denom: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_movk_i32 s0, 0xfff ; GFX90A-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 @@ -10737,20 +10736,20 @@ ; ; GFX9-LABEL: urem_v2i64_pow2_shl_denom: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b64 s[0:1], 0x1000 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], s10 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s8 +; GFX9-NEXT: s_lshl_b64 s[6:7], s[0:1], s6 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 ; GFX9-NEXT: s_add_u32 s0, s0, -1 ; GFX9-NEXT: s_addc_u32 s1, s1, -1 -; GFX9-NEXT: s_and_b64 s[0:1], s[4:5], s[0:1] -; GFX9-NEXT: s_add_u32 s4, s10, -1 -; GFX9-NEXT: s_addc_u32 s5, s11, -1 -; GFX9-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] +; GFX9-NEXT: s_and_b64 s[0:1], s[8:9], s[0:1] +; GFX9-NEXT: s_add_u32 s4, s6, -1 +; GFX9-NEXT: s_addc_u32 s5, s7, -1 +; GFX9-NEXT: s_and_b64 s[4:5], s[10:11], s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 @@ -10760,20 +10759,20 @@ ; ; GFX90A-LABEL: urem_v2i64_pow2_shl_denom: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 +; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX90A-NEXT: s_mov_b64 s[0:1], 0x1000 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_lshl_b64 s[10:11], s[0:1], s10 -; GFX90A-NEXT: s_lshl_b64 s[0:1], s[0:1], s8 +; GFX90A-NEXT: s_lshl_b64 s[6:7], s[0:1], s6 +; GFX90A-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 ; GFX90A-NEXT: s_add_u32 s0, s0, -1 ; GFX90A-NEXT: s_addc_u32 s1, s1, -1 -; GFX90A-NEXT: s_and_b64 s[0:1], s[4:5], s[0:1] -; GFX90A-NEXT: s_add_u32 s4, s10, -1 -; GFX90A-NEXT: s_addc_u32 s5, s11, -1 -; GFX90A-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] +; GFX90A-NEXT: s_and_b64 s[0:1], s[8:9], s[0:1] +; GFX90A-NEXT: s_add_u32 s4, s6, -1 +; GFX90A-NEXT: s_addc_u32 s5, s7, -1 +; GFX90A-NEXT: s_and_b64 s[4:5], s[10:11], s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-NEXT: v_mov_b32_e32 v2, s4 @@ -11615,32 +11614,32 @@ ; ; GFX6-LABEL: sdiv_v2i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_ashr_i32 s8, s1, 31 +; GFX6-NEXT: s_ashr_i32 s8, s5, 31 ; GFX6-NEXT: s_lshr_b32 s8, s8, 20 -; GFX6-NEXT: s_add_u32 s0, s0, s8 -; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_ashr_i32 s8, s3, 31 -; GFX6-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 +; GFX6-NEXT: s_add_u32 s4, s4, s8 +; GFX6-NEXT: s_addc_u32 s5, s5, 0 +; GFX6-NEXT: s_ashr_i32 s8, s7, 31 +; GFX6-NEXT: s_ashr_i64 s[4:5], s[4:5], 12 ; GFX6-NEXT: s_lshr_b32 s8, s8, 20 -; GFX6-NEXT: s_add_u32 s2, s2, s8 -; GFX6-NEXT: s_addc_u32 s3, s3, 0 -; GFX6-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: v_mov_b32_e32 v2, s2 -; GFX6-NEXT: v_mov_b32_e32 v3, s3 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX6-NEXT: s_add_u32 s6, s6, s8 +; GFX6-NEXT: s_addc_u32 s7, s7, 0 +; GFX6-NEXT: s_ashr_i64 s[6:7], s[6:7], 12 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v2i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s0, s5, 31 @@ -11662,8 +11661,8 @@ ; ; GFX90A-LABEL: sdiv_v2i64_pow2k_denom: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_ashr_i32 s0, s5, 31 @@ -11948,8 +11947,8 @@ ; GFX90A-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX90A-NEXT: v_mac_f32_e32 v0, 0, v1 ; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX90A-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 @@ -13216,22 +13215,22 @@ ; ; GFX6-LABEL: srem_i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_ashr_i32 s4, s7, 31 -; GFX6-NEXT: s_lshr_b32 s4, s4, 20 -; GFX6-NEXT: s_add_u32 s4, s6, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: s_addc_u32 s5, s7, 0 -; GFX6-NEXT: s_and_b32 s4, s4, 0xfffff000 -; GFX6-NEXT: s_sub_u32 s4, s6, s4 -; GFX6-NEXT: s_subb_u32 s5, s7, s5 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_ashr_i32 s0, s3, 31 +; GFX6-NEXT: s_lshr_b32 s0, s0, 20 +; GFX6-NEXT: s_add_u32 s0, s2, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: s_addc_u32 s1, s3, 0 +; GFX6-NEXT: s_and_b32 s0, s0, 0xfffff000 +; GFX6-NEXT: s_sub_u32 s0, s2, s0 +; GFX6-NEXT: s_subb_u32 s1, s3, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_i64_pow2k_denom: @@ -13696,85 +13695,85 @@ ; ; GFX6-LABEL: srem_v2i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_movk_i32 s8, 0xf000 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_ashr_i32 s9, s1, 31 +; GFX6-NEXT: s_ashr_i32 s9, s5, 31 ; GFX6-NEXT: s_lshr_b32 s9, s9, 20 -; GFX6-NEXT: s_add_u32 s9, s0, s9 -; GFX6-NEXT: s_addc_u32 s10, s1, 0 +; GFX6-NEXT: s_add_u32 s9, s4, s9 +; GFX6-NEXT: s_addc_u32 s10, s5, 0 ; GFX6-NEXT: s_and_b32 s9, s9, s8 -; GFX6-NEXT: s_sub_u32 s0, s0, s9 -; GFX6-NEXT: s_subb_u32 s1, s1, s10 -; GFX6-NEXT: s_ashr_i32 s9, s3, 31 +; GFX6-NEXT: s_sub_u32 s4, s4, s9 +; GFX6-NEXT: s_subb_u32 s5, s5, s10 +; GFX6-NEXT: s_ashr_i32 s9, s7, 31 ; GFX6-NEXT: s_lshr_b32 s9, s9, 20 -; GFX6-NEXT: s_add_u32 s9, s2, s9 -; GFX6-NEXT: s_addc_u32 s10, s3, 0 +; GFX6-NEXT: s_add_u32 s9, s6, s9 +; GFX6-NEXT: s_addc_u32 s10, s7, 0 ; GFX6-NEXT: s_and_b32 s8, s9, s8 -; GFX6-NEXT: s_sub_u32 s2, s2, s8 -; GFX6-NEXT: s_subb_u32 s3, s3, s10 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: v_mov_b32_e32 v2, s2 -; GFX6-NEXT: v_mov_b32_e32 v3, s3 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX6-NEXT: s_sub_u32 s6, s6, s8 +; GFX6-NEXT: s_subb_u32 s7, s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_v2i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX9-NEXT: s_movk_i32 s8, 0xf000 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_movk_i32 s0, 0xf000 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s0, s5, 31 -; GFX9-NEXT: s_lshr_b32 s0, s0, 20 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, 0 -; GFX9-NEXT: s_and_b32 s0, s0, s8 -; GFX9-NEXT: s_sub_u32 s0, s4, s0 -; GFX9-NEXT: s_subb_u32 s1, s5, s1 -; GFX9-NEXT: s_ashr_i32 s4, s7, 31 -; GFX9-NEXT: s_lshr_b32 s4, s4, 20 -; GFX9-NEXT: s_add_u32 s4, s6, s4 -; GFX9-NEXT: s_addc_u32 s5, s7, 0 -; GFX9-NEXT: s_and_b32 s4, s4, s8 -; GFX9-NEXT: s_sub_u32 s4, s6, s4 -; GFX9-NEXT: s_subb_u32 s5, s7, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_ashr_i32 s1, s5, 31 +; GFX9-NEXT: s_lshr_b32 s1, s1, 20 +; GFX9-NEXT: s_add_u32 s1, s4, s1 +; GFX9-NEXT: s_addc_u32 s8, s5, 0 +; GFX9-NEXT: s_and_b32 s1, s1, s0 +; GFX9-NEXT: s_sub_u32 s1, s4, s1 +; GFX9-NEXT: s_subb_u32 s4, s5, s8 +; GFX9-NEXT: s_ashr_i32 s5, s7, 31 +; GFX9-NEXT: s_lshr_b32 s5, s5, 20 +; GFX9-NEXT: s_add_u32 s5, s6, s5 +; GFX9-NEXT: s_addc_u32 s8, s7, 0 +; GFX9-NEXT: s_and_b32 s0, s5, s0 +; GFX9-NEXT: s_sub_u32 s0, s6, s0 +; GFX9-NEXT: s_subb_u32 s5, s7, s8 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX90A-LABEL: srem_v2i64_pow2k_denom: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX90A-NEXT: s_movk_i32 s8, 0xf000 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_movk_i32 s0, 0xf000 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_ashr_i32 s0, s5, 31 -; GFX90A-NEXT: s_lshr_b32 s0, s0, 20 -; GFX90A-NEXT: s_add_u32 s0, s4, s0 -; GFX90A-NEXT: s_addc_u32 s1, s5, 0 -; GFX90A-NEXT: s_and_b32 s0, s0, s8 -; GFX90A-NEXT: s_sub_u32 s0, s4, s0 -; GFX90A-NEXT: s_subb_u32 s1, s5, s1 -; GFX90A-NEXT: s_ashr_i32 s4, s7, 31 -; GFX90A-NEXT: s_lshr_b32 s4, s4, 20 -; GFX90A-NEXT: s_add_u32 s4, s6, s4 -; GFX90A-NEXT: s_addc_u32 s5, s7, 0 -; GFX90A-NEXT: s_and_b32 s4, s4, s8 -; GFX90A-NEXT: s_sub_u32 s4, s6, s4 -; GFX90A-NEXT: s_subb_u32 s5, s7, s5 -; GFX90A-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: s_ashr_i32 s1, s5, 31 +; GFX90A-NEXT: s_lshr_b32 s1, s1, 20 +; GFX90A-NEXT: s_add_u32 s1, s4, s1 +; GFX90A-NEXT: s_addc_u32 s8, s5, 0 +; GFX90A-NEXT: s_and_b32 s1, s1, s0 +; GFX90A-NEXT: s_sub_u32 s1, s4, s1 +; GFX90A-NEXT: s_subb_u32 s4, s5, s8 +; GFX90A-NEXT: s_ashr_i32 s5, s7, 31 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 20 +; GFX90A-NEXT: s_add_u32 s5, s6, s5 +; GFX90A-NEXT: s_addc_u32 s8, s7, 0 +; GFX90A-NEXT: s_and_b32 s0, s5, s0 +; GFX90A-NEXT: s_sub_u32 s0, s6, s0 +; GFX90A-NEXT: s_subb_u32 s5, s7, s8 +; GFX90A-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, s0 ; GFX90A-NEXT: v_mov_b32_e32 v3, s5 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX90A-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/and.ll b/llvm/test/CodeGen/AMDGPU/and.ll --- a/llvm/test/CodeGen/AMDGPU/and.ll +++ b/llvm/test/CodeGen/AMDGPU/and.ll @@ -404,8 +404,8 @@ ; FUNC-LABEL: {{^}}s_and_inline_imm_1.0_i64 ; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 1.0 -; SI: s_load_dwordx2 ; SI: s_load_dword +; SI: s_load_dwordx2 ; SI-NOT: and ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x3ff00000 ; SI-NOT: and @@ -419,8 +419,8 @@ ; FUNC-LABEL: {{^}}s_and_inline_imm_neg_1.0_i64 ; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -1.0 -; SI: s_load_dwordx2 ; SI: s_load_dword +; SI: s_load_dwordx2 ; SI-NOT: and ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xbff00000 ; SI-NOT: and @@ -434,8 +434,8 @@ ; FUNC-LABEL: {{^}}s_and_inline_imm_0.5_i64 ; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0.5 -; SI: s_load_dwordx2 ; SI: s_load_dword +; SI: s_load_dwordx2 ; SI-NOT: and ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x3fe00000 ; SI-NOT: and @@ -449,8 +449,8 @@ ; FUNC-LABEL: {{^}}s_and_inline_imm_neg_0.5_i64: ; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -0.5 -; SI: s_load_dwordx2 ; SI: s_load_dword +; SI: s_load_dwordx2 ; SI-NOT: and ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xbfe00000 ; SI-NOT: and @@ -462,8 +462,8 @@ } ; FUNC-LABEL: {{^}}s_and_inline_imm_2.0_i64: -; SI: s_load_dwordx2 ; SI: s_load_dword +; SI: s_load_dwordx2 ; SI-NOT: and ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 2.0 ; SI-NOT: and @@ -475,8 +475,8 @@ } ; FUNC-LABEL: {{^}}s_and_inline_imm_neg_2.0_i64: -; SI: s_load_dwordx2 ; SI: s_load_dword +; SI: s_load_dwordx2 ; SI-NOT: and ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, -2.0 ; SI-NOT: and @@ -490,8 +490,8 @@ ; FUNC-LABEL: {{^}}s_and_inline_imm_4.0_i64: ; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 4.0 -; SI: s_load_dwordx2 ; SI: s_load_dword +; SI: s_load_dwordx2 ; SI-NOT: and ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x40100000 ; SI-NOT: and @@ -505,8 +505,8 @@ ; FUNC-LABEL: {{^}}s_and_inline_imm_neg_4.0_i64: ; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -4.0 -; SI: s_load_dwordx2 ; SI: s_load_dword +; SI: s_load_dwordx2 ; SI-NOT: and ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xc0100000 ; SI-NOT: and @@ -522,8 +522,8 @@ ; low 32-bits, which is not a valid 64-bit inline immmediate. ; FUNC-LABEL: {{^}}s_and_inline_imm_f32_4.0_i64: -; SI: s_load_dwordx2 ; SI: s_load_dword s +; SI: s_load_dwordx2 ; SI-NOT: and ; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, 4.0 ; SI-NOT: and @@ -548,8 +548,8 @@ } ; Shift into upper 32-bits -; SI: s_load_dwordx2 ; SI: s_load_dword +; SI: s_load_dwordx2 ; SI-NOT: and ; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, 4.0 ; SI-NOT: and @@ -561,8 +561,8 @@ } ; FUNC-LABEL: {{^}}s_and_inline_high_imm_f32_neg_4.0_i64: -; SI: s_load_dwordx2 ; SI: s_load_dword +; SI: s_load_dwordx2 ; SI-NOT: and ; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, -4.0 ; SI-NOT: and diff --git a/llvm/test/CodeGen/AMDGPU/anyext.ll b/llvm/test/CodeGen/AMDGPU/anyext.ll --- a/llvm/test/CodeGen/AMDGPU/anyext.ll +++ b/llvm/test/CodeGen/AMDGPU/anyext.ll @@ -9,36 +9,36 @@ define amdgpu_kernel void @anyext_i1_i32(i32 addrspace(1)* %out, i32 %cond) #0 { ; GCN-LABEL: anyext_i1_i32: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xb -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s0, 0 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GFX8-LABEL: anyext_i1_i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX8-NEXT: s_load_dword s0, s[0:1], 0x2c -; GFX8-NEXT: s_mov_b32 s7, 0xf000 -; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_cmp_eq_u32 s0, 0 -; GFX8-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX8-NEXT: s_cmp_eq_u32 s4, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: anyext_i1_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -89,49 +89,45 @@ ; GFX8-LABEL: s_anyext_i16_i32: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, s7 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s6, v0 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v1 -; GFX8-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s8, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_ushort v2, v[2:3] ; GFX8-NEXT: flat_load_ushort v0, v[0:1] -; GFX8-NEXT: s_mov_b32 s0, s4 -; GFX8-NEXT: s_mov_b32 s1, s5 +; GFX8-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_u16_e32 v0, v2, v0 ; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: s_anyext_i16_i32: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v3, v1, s[8:9] -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: global_load_ushort v3, v1, s[2:3] +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, v2, v3 ; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm entry: %tid.x = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -1009,22 +1009,22 @@ ; ; GFX89-LABEL: add_i64_varying: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX89-NEXT: s_mov_b32 s3, 0xf000 -; GFX89-NEXT: s_mov_b32 s2, -1 -; GFX89-NEXT: v_mov_b32_e32 v1, 0 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_mov_b32 s10, s6 +; GFX89-NEXT: s_mov_b32 s11, s7 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s0, s4 -; GFX89-NEXT: s_mov_b32 s1, s5 -; GFX89-NEXT: s_mov_b32 s4, s6 -; GFX89-NEXT: s_mov_b32 s5, s7 -; GFX89-NEXT: s_mov_b32 s6, s2 -; GFX89-NEXT: s_mov_b32 s7, s3 +; GFX89-NEXT: s_mov_b32 s8, s2 +; GFX89-NEXT: s_mov_b32 s9, s3 +; GFX89-NEXT: v_mov_b32_e32 v1, 0 ; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX89-NEXT: buffer_atomic_add_x2 v[0:1], off, s[4:7], 0 glc +; GFX89-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc ; GFX89-NEXT: s_waitcnt vmcnt(0) ; GFX89-NEXT: buffer_wbinvl1_vol -; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_mov_b32 s5, s1 +; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX89-NEXT: s_endpgm ; ; GFX10-LABEL: add_i64_varying: @@ -2140,22 +2140,22 @@ ; ; GFX89-LABEL: sub_i64_varying: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX89-NEXT: s_mov_b32 s3, 0xf000 -; GFX89-NEXT: s_mov_b32 s2, -1 -; GFX89-NEXT: v_mov_b32_e32 v1, 0 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_mov_b32 s10, s6 +; GFX89-NEXT: s_mov_b32 s11, s7 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s0, s4 -; GFX89-NEXT: s_mov_b32 s1, s5 -; GFX89-NEXT: s_mov_b32 s4, s6 -; GFX89-NEXT: s_mov_b32 s5, s7 -; GFX89-NEXT: s_mov_b32 s6, s2 -; GFX89-NEXT: s_mov_b32 s7, s3 +; GFX89-NEXT: s_mov_b32 s8, s2 +; GFX89-NEXT: s_mov_b32 s9, s3 +; GFX89-NEXT: v_mov_b32_e32 v1, 0 ; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX89-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0 glc +; GFX89-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc ; GFX89-NEXT: s_waitcnt vmcnt(0) ; GFX89-NEXT: buffer_wbinvl1_vol -; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_mov_b32 s5, s1 +; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX89-NEXT: s_endpgm ; ; GFX10-LABEL: sub_i64_varying: diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll --- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll +++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll @@ -19,41 +19,41 @@ define amdgpu_kernel void @s_brev_i16(i16 addrspace(1)* noalias %out, i16 %val) #0 { ; SI-LABEL: s_brev_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_brev_b32 s0, s0 -; SI-NEXT: s_lshr_b32 s0, s0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_brev_b32 s4, s4 +; SI-NEXT: s_lshr_b32 s4, s4, 16 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; FLAT-LABEL: s_brev_i16: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; FLAT-NEXT: s_load_dword s0, s[0:1], 0x2c -; FLAT-NEXT: s_mov_b32 s7, 0xf000 -; FLAT-NEXT: s_mov_b32 s6, -1 +; FLAT-NEXT: s_load_dword s4, s[0:1], 0x2c +; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; FLAT-NEXT: s_mov_b32 s3, 0xf000 +; FLAT-NEXT: s_mov_b32 s2, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: s_brev_b32 s0, s0 -; FLAT-NEXT: s_lshr_b32 s0, s0, 16 -; FLAT-NEXT: v_mov_b32_e32 v0, s0 -; FLAT-NEXT: buffer_store_short v0, off, s[4:7], 0 +; FLAT-NEXT: s_brev_b32 s4, s4 +; FLAT-NEXT: s_lshr_b32 s4, s4, 16 +; FLAT-NEXT: v_mov_b32_e32 v0, s4 +; FLAT-NEXT: buffer_store_short v0, off, s[0:3], 0 ; FLAT-NEXT: s_endpgm ; ; GISEL-LABEL: s_brev_i16: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GISEL-NEXT: s_load_dword s0, s[0:1], 0x2c +; GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-NEXT: s_and_b32 s0, s0, 0xffff -; GISEL-NEXT: s_brev_b32 s0, s0 -; GISEL-NEXT: s_lshr_b32 s0, s0, 16 -; GISEL-NEXT: v_mov_b32_e32 v2, s0 -; GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-NEXT: s_and_b32 s2, s2, 0xffff +; GISEL-NEXT: s_brev_b32 s2, s2 +; GISEL-NEXT: s_lshr_b32 s2, s2, 16 +; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GISEL-NEXT: flat_store_short v[0:1], v2 ; GISEL-NEXT: s_endpgm %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1 @@ -64,49 +64,51 @@ define amdgpu_kernel void @v_brev_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) #0 { ; SI-LABEL: v_brev_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s2, s6 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bfrev_b32_e32 v0, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; FLAT-LABEL: v_brev_i16: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; FLAT-NEXT: s_mov_b32 s7, 0xf000 -; FLAT-NEXT: s_mov_b32 s6, -1 -; FLAT-NEXT: s_mov_b32 s2, s6 -; FLAT-NEXT: s_mov_b32 s3, s7 +; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; FLAT-NEXT: s_mov_b32 s3, 0xf000 +; FLAT-NEXT: s_mov_b32 s2, -1 +; FLAT-NEXT: s_mov_b32 s6, s2 +; FLAT-NEXT: s_mov_b32 s7, s3 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; FLAT-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; FLAT-NEXT: s_waitcnt vmcnt(0) ; FLAT-NEXT: v_bfrev_b32_e32 v0, v0 ; FLAT-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; FLAT-NEXT: buffer_store_short v0, off, s[4:7], 0 +; FLAT-NEXT: s_waitcnt lgkmcnt(0) +; FLAT-NEXT: buffer_store_short v0, off, s[0:3], 0 ; FLAT-NEXT: s_endpgm ; ; GISEL-LABEL: v_brev_i16: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-NEXT: v_mov_b32_e32 v1, s3 ; GISEL-NEXT: flat_load_ushort v0, v[0:1] ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_bfrev_b32_e32 v0, v0 ; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GISEL-NEXT: flat_store_short v[0:1], v2 ; GISEL-NEXT: s_endpgm %val = load i16, i16 addrspace(1)* %valptr @@ -118,37 +120,37 @@ define amdgpu_kernel void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) #0 { ; SI-LABEL: s_brev_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_brev_b32 s0, s0 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_brev_b32 s4, s4 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; FLAT-LABEL: s_brev_i32: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; FLAT-NEXT: s_load_dword s0, s[0:1], 0x2c -; FLAT-NEXT: s_mov_b32 s7, 0xf000 -; FLAT-NEXT: s_mov_b32 s6, -1 +; FLAT-NEXT: s_load_dword s4, s[0:1], 0x2c +; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; FLAT-NEXT: s_mov_b32 s3, 0xf000 +; FLAT-NEXT: s_mov_b32 s2, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: s_brev_b32 s0, s0 -; FLAT-NEXT: v_mov_b32_e32 v0, s0 -; FLAT-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; FLAT-NEXT: s_brev_b32 s4, s4 +; FLAT-NEXT: v_mov_b32_e32 v0, s4 +; FLAT-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; FLAT-NEXT: s_endpgm ; ; GISEL-LABEL: s_brev_i32: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GISEL-NEXT: s_load_dword s0, s[0:1], 0x2c +; GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-NEXT: s_brev_b32 s0, s0 -; GISEL-NEXT: v_mov_b32_e32 v2, s0 -; GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-NEXT: s_brev_b32 s2, s2 +; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GISEL-NEXT: flat_store_dword v[0:1], v2 ; GISEL-NEXT: s_endpgm %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1 @@ -159,53 +161,54 @@ define amdgpu_kernel void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 { ; SI-LABEL: v_brev_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bfrev_b32_e32 v0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; FLAT-LABEL: v_brev_i32: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; FLAT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; FLAT-NEXT: s_mov_b32 s7, 0xf000 -; FLAT-NEXT: s_mov_b32 s6, -1 +; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: v_mov_b32_e32 v1, s1 -; FLAT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; FLAT-NEXT: v_mov_b32_e32 v1, s3 +; FLAT-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; FLAT-NEXT: flat_load_dword v0, v[0:1] +; FLAT-NEXT: s_mov_b32 s3, 0xf000 +; FLAT-NEXT: s_mov_b32 s2, -1 ; FLAT-NEXT: s_waitcnt vmcnt(0) ; FLAT-NEXT: v_bfrev_b32_e32 v0, v0 -; FLAT-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; FLAT-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; FLAT-NEXT: s_endpgm ; ; GISEL-LABEL: v_brev_i32: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-NEXT: v_mov_b32_e32 v1, s3 ; GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GISEL-NEXT: flat_load_dword v0, v[0:1] ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_bfrev_b32_e32 v2, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GISEL-NEXT: flat_store_dword v[0:1], v2 ; GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -219,43 +222,43 @@ define amdgpu_kernel void @s_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> %val) #0 { ; SI-LABEL: s_brev_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_brev_b32 s1, s1 -; SI-NEXT: s_brev_b32 s0, s0 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_brev_b32 s5, s5 +; SI-NEXT: s_brev_b32 s4, s4 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; FLAT-LABEL: s_brev_v2i32: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; FLAT-NEXT: s_mov_b32 s7, 0xf000 -; FLAT-NEXT: s_mov_b32 s6, -1 +; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; FLAT-NEXT: s_mov_b32 s3, 0xf000 +; FLAT-NEXT: s_mov_b32 s2, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: s_brev_b32 s1, s1 -; FLAT-NEXT: s_brev_b32 s0, s0 -; FLAT-NEXT: v_mov_b32_e32 v0, s0 -; FLAT-NEXT: v_mov_b32_e32 v1, s1 -; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; FLAT-NEXT: s_brev_b32 s5, s5 +; FLAT-NEXT: s_brev_b32 s4, s4 +; FLAT-NEXT: v_mov_b32_e32 v0, s4 +; FLAT-NEXT: v_mov_b32_e32 v1, s5 +; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; FLAT-NEXT: s_endpgm ; ; GISEL-LABEL: s_brev_v2i32: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GISEL-NEXT: s_brev_b32 s0, s0 -; GISEL-NEXT: s_brev_b32 s1, s1 -; GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GISEL-NEXT: v_mov_b32_e32 v3, s3 +; GISEL-NEXT: s_brev_b32 s2, s2 +; GISEL-NEXT: s_brev_b32 s3, s3 +; GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-NEXT: s_endpgm %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1 @@ -266,53 +269,54 @@ define amdgpu_kernel void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) #0 { ; SI-LABEL: v_brev_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bfrev_b32_e32 v1, v1 ; SI-NEXT: v_bfrev_b32_e32 v0, v0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; FLAT-LABEL: v_brev_v2i32: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; FLAT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; FLAT-NEXT: s_mov_b32 s7, 0xf000 -; FLAT-NEXT: s_mov_b32 s6, -1 +; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: v_mov_b32_e32 v1, s1 -; FLAT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; FLAT-NEXT: v_mov_b32_e32 v1, s3 +; FLAT-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; FLAT-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; FLAT-NEXT: s_mov_b32 s3, 0xf000 +; FLAT-NEXT: s_mov_b32 s2, -1 ; FLAT-NEXT: s_waitcnt vmcnt(0) ; FLAT-NEXT: v_bfrev_b32_e32 v1, v1 ; FLAT-NEXT: v_bfrev_b32_e32 v0, v0 -; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; FLAT-NEXT: s_endpgm ; ; GISEL-LABEL: v_brev_v2i32: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-NEXT: v_mov_b32_e32 v1, s3 ; GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GISEL-NEXT: v_mov_b32_e32 v3, s3 +; GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_bfrev_b32_e32 v0, v0 ; GISEL-NEXT: v_bfrev_b32_e32 v1, v1 @@ -329,40 +333,40 @@ define amdgpu_kernel void @s_brev_i64(i64 addrspace(1)* noalias %out, i64 %val) #0 { ; SI-LABEL: s_brev_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_brev_b64 s[0:1], s[0:1] -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_brev_b64 s[4:5], s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; FLAT-LABEL: s_brev_i64: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; FLAT-NEXT: s_mov_b32 s7, 0xf000 -; FLAT-NEXT: s_mov_b32 s6, -1 +; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; FLAT-NEXT: s_mov_b32 s3, 0xf000 +; FLAT-NEXT: s_mov_b32 s2, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: s_brev_b64 s[0:1], s[0:1] -; FLAT-NEXT: v_mov_b32_e32 v0, s0 -; FLAT-NEXT: v_mov_b32_e32 v1, s1 -; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; FLAT-NEXT: s_brev_b64 s[4:5], s[4:5] +; FLAT-NEXT: v_mov_b32_e32 v0, s4 +; FLAT-NEXT: v_mov_b32_e32 v1, s5 +; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; FLAT-NEXT: s_endpgm ; ; GISEL-LABEL: s_brev_i64: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GISEL-NEXT: s_brev_b64 s[0:1], s[0:1] -; GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GISEL-NEXT: v_mov_b32_e32 v3, s3 +; GISEL-NEXT: s_brev_b64 s[2:3], s[2:3] +; GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-NEXT: s_endpgm %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1 @@ -373,53 +377,54 @@ define amdgpu_kernel void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %valptr) #0 { ; SI-LABEL: v_brev_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bfrev_b32_e32 v2, v0 ; SI-NEXT: v_bfrev_b32_e32 v1, v1 -; SI-NEXT: buffer_store_dwordx2 v[1:2], off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; FLAT-LABEL: v_brev_i64: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; FLAT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; FLAT-NEXT: s_mov_b32 s7, 0xf000 -; FLAT-NEXT: s_mov_b32 s6, -1 +; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: v_mov_b32_e32 v1, s1 -; FLAT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; FLAT-NEXT: v_mov_b32_e32 v1, s3 +; FLAT-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; FLAT-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; FLAT-NEXT: s_mov_b32 s3, 0xf000 +; FLAT-NEXT: s_mov_b32 s2, -1 ; FLAT-NEXT: s_waitcnt vmcnt(0) ; FLAT-NEXT: v_bfrev_b32_e32 v2, v0 ; FLAT-NEXT: v_bfrev_b32_e32 v1, v1 -; FLAT-NEXT: buffer_store_dwordx2 v[1:2], off, s[4:7], 0 +; FLAT-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 ; FLAT-NEXT: s_endpgm ; ; GISEL-LABEL: v_brev_i64: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v4, s3 -; GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-NEXT: v_mov_b32_e32 v1, s3 ; GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GISEL-NEXT: v_mov_b32_e32 v3, s2 +; GISEL-NEXT: v_mov_b32_e32 v4, s1 +; GISEL-NEXT: v_mov_b32_e32 v3, s0 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_bfrev_b32_e32 v1, v1 ; GISEL-NEXT: v_bfrev_b32_e32 v2, v0 @@ -436,49 +441,49 @@ define amdgpu_kernel void @s_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %val) #0 { ; SI-LABEL: s_brev_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_brev_b64 s[2:3], s[2:3] -; SI-NEXT: s_brev_b64 s[0:1], s[0:1] -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: v_mov_b32_e32 v3, s3 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: s_brev_b64 s[6:7], s[6:7] +; SI-NEXT: s_brev_b64 s[4:5], s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; FLAT-LABEL: s_brev_v2i64: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 -; FLAT-NEXT: s_mov_b32 s7, 0xf000 -; FLAT-NEXT: s_mov_b32 s6, -1 +; FLAT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; FLAT-NEXT: s_mov_b32 s3, 0xf000 +; FLAT-NEXT: s_mov_b32 s2, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: s_brev_b64 s[2:3], s[2:3] -; FLAT-NEXT: s_brev_b64 s[0:1], s[0:1] -; FLAT-NEXT: v_mov_b32_e32 v0, s0 -; FLAT-NEXT: v_mov_b32_e32 v1, s1 -; FLAT-NEXT: v_mov_b32_e32 v2, s2 -; FLAT-NEXT: v_mov_b32_e32 v3, s3 -; FLAT-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; FLAT-NEXT: s_brev_b64 s[6:7], s[6:7] +; FLAT-NEXT: s_brev_b64 s[4:5], s[4:5] +; FLAT-NEXT: v_mov_b32_e32 v0, s4 +; FLAT-NEXT: v_mov_b32_e32 v1, s5 +; FLAT-NEXT: v_mov_b32_e32 v2, s6 +; FLAT-NEXT: v_mov_b32_e32 v3, s7 +; FLAT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; FLAT-NEXT: s_endpgm ; ; GISEL-LABEL: s_brev_v2i64: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v4, s4 -; GISEL-NEXT: s_brev_b64 s[0:1], s[0:1] -; GISEL-NEXT: s_brev_b64 s[2:3], s[2:3] +; GISEL-NEXT: s_brev_b64 s[0:1], s[4:5] +; GISEL-NEXT: s_brev_b64 s[2:3], s[6:7] ; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: v_mov_b32_e32 v4, s8 ; GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GISEL-NEXT: v_mov_b32_e32 v3, s3 -; GISEL-NEXT: v_mov_b32_e32 v5, s5 +; GISEL-NEXT: v_mov_b32_e32 v5, s9 ; GISEL-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GISEL-NEXT: s_endpgm %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1 @@ -489,62 +494,63 @@ define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %valptr) #0 { ; SI-LABEL: v_brev_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bfrev_b32_e32 v4, v2 ; SI-NEXT: v_bfrev_b32_e32 v3, v3 ; SI-NEXT: v_bfrev_b32_e32 v2, v0 ; SI-NEXT: v_bfrev_b32_e32 v1, v1 -; SI-NEXT: buffer_store_dwordx4 v[1:4], off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; FLAT-LABEL: v_brev_v2i64: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; FLAT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; FLAT-NEXT: s_mov_b32 s7, 0xf000 -; FLAT-NEXT: s_mov_b32 s6, -1 +; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: v_mov_b32_e32 v1, s1 -; FLAT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; FLAT-NEXT: v_mov_b32_e32 v1, s3 +; FLAT-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; FLAT-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; FLAT-NEXT: s_mov_b32 s3, 0xf000 +; FLAT-NEXT: s_mov_b32 s2, -1 ; FLAT-NEXT: s_waitcnt vmcnt(0) ; FLAT-NEXT: v_bfrev_b32_e32 v4, v2 ; FLAT-NEXT: v_bfrev_b32_e32 v3, v3 ; FLAT-NEXT: v_bfrev_b32_e32 v2, v0 ; FLAT-NEXT: v_bfrev_b32_e32 v1, v1 -; FLAT-NEXT: buffer_store_dwordx4 v[1:4], off, s[4:7], 0 +; FLAT-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0 ; FLAT-NEXT: s_endpgm ; ; GISEL-LABEL: v_brev_v2i64: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0 +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-NEXT: v_mov_b32_e32 v1, s3 ; GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GISEL-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_bfrev_b32_e32 v4, v1 ; GISEL-NEXT: v_bfrev_b32_e32 v5, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-NEXT: v_bfrev_b32_e32 v6, v3 ; GISEL-NEXT: v_bfrev_b32_e32 v7, v2 -; GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GISEL-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx10.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx10.ll --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx10.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx10.ll @@ -5,8 +5,8 @@ ; instruction, rather than 8 in previous generations. ; GCN-LABEL: {{^}}long_forward_branch_gfx10only: -; GFX9: s_cmp_eq_u32 ; GFX9: s_load_dwordx2 +; GFX9: s_cmp_eq_u32 ; GFX9-NEXT: s_cbranch_scc1 ; GFX10: s_cmp_eq_u32 diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll @@ -391,7 +391,6 @@ ; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} ; GCN-NEXT: [[IF]]: ; %if -; GCN: buffer_store_dword ; GCN: s_cmp_lg_u32 ; GCN: s_cbranch_scc1 [[ENDIF:.LBB[0-9]+_[0-9]+]] diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll --- a/llvm/test/CodeGen/AMDGPU/bswap.ll +++ b/llvm/test/CodeGen/AMDGPU/bswap.ll @@ -38,11 +38,11 @@ ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_load_dword s0, s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_perm_b32 v0, 0, s0, v0 +; VI-NEXT: v_perm_b32 v0, 0, s2, v0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %val = load i32, i32 addrspace(1)* %in, align 4 @@ -118,20 +118,20 @@ ; ; VI-LABEL: test_bswap_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x10203 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_perm_b32 v3, 0, s11, v0 ; VI-NEXT: v_perm_b32 v2, 0, s10, v0 ; VI-NEXT: v_perm_b32 v1, 0, s9, v0 ; VI-NEXT: v_perm_b32 v0, 0, s8, v0 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm %val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16 %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %val) nounwind readnone @@ -179,14 +179,14 @@ ; ; VI-LABEL: test_bswap_v8i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v4, 0x10203 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s15, 0xf000 +; VI-NEXT: s_mov_b32 s14, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s0 -; VI-NEXT: s_mov_b32 s9, s1 -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x0 +; VI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 +; VI-NEXT: s_mov_b32 s12, s8 +; VI-NEXT: s_mov_b32 s13, s9 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_perm_b32 v3, 0, s3, v4 ; VI-NEXT: v_perm_b32 v2, 0, s2, v4 @@ -196,8 +196,8 @@ ; VI-NEXT: v_perm_b32 v6, 0, s6, v4 ; VI-NEXT: v_perm_b32 v5, 0, s5, v4 ; VI-NEXT: v_perm_b32 v4, 0, s4, v4 -; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; VI-NEXT: s_endpgm %val = load <8 x i32>, <8 x i32> addrspace(1)* %in, align 32 %bswap = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %val) nounwind readnone @@ -272,20 +272,20 @@ ; ; VI-LABEL: test_bswap_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x10203 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_perm_b32 v3, 0, s10, v0 ; VI-NEXT: v_perm_b32 v2, 0, s11, v0 ; VI-NEXT: v_perm_b32 v1, 0, s8, v0 ; VI-NEXT: v_perm_b32 v0, 0, s9, v0 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16 %bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %val) nounwind readnone @@ -333,14 +333,14 @@ ; ; VI-LABEL: test_bswap_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v4, 0x10203 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s15, 0xf000 +; VI-NEXT: s_mov_b32 s14, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s0 -; VI-NEXT: s_mov_b32 s9, s1 -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x0 +; VI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 +; VI-NEXT: s_mov_b32 s12, s8 +; VI-NEXT: s_mov_b32 s13, s9 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_perm_b32 v3, 0, s2, v4 ; VI-NEXT: v_perm_b32 v2, 0, s3, v4 @@ -350,8 +350,8 @@ ; VI-NEXT: v_perm_b32 v6, 0, s7, v4 ; VI-NEXT: v_perm_b32 v5, 0, s4, v4 ; VI-NEXT: v_perm_b32 v4, 0, s5, v4 -; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; VI-NEXT: s_endpgm %val = load <4 x i64>, <4 x i64> addrspace(1)* %in, align 32 %bswap = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %val) nounwind readnone diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll --- a/llvm/test/CodeGen/AMDGPU/build_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll @@ -66,13 +66,13 @@ ; GFX6: s_mov_b32 s2, -1 ; GFX6: v_mov_b32_e32 v0, s4 ; GFX6: buffer_store_dword v0, off, s[0:3], 0 -; GFX8: s_mov_b32 s7, 0xf000 -; GFX8: s_mov_b32 s6, -1 +; GFX8: s_mov_b32 s3, 0xf000 +; GFX8: s_mov_b32 s2, -1 ; GFX8: s_waitcnt lgkmcnt(0) -; GFX8: s_lshr_b32 s0, s0, 16 -; GFX8: s_or_b32 s0, s0, 0x50000 -; GFX8: v_mov_b32_e32 v0, s0 -; GFX8: buffer_store_dword v0, off, s[4:7], 0 +; GFX8: s_lshr_b32 s4, s4, 16 +; GFX8: s_or_b32 s4, s4, 0x50000 +; GFX8: v_mov_b32_e32 v0, s4 +; GFX8: buffer_store_dword v0, off, s[0:3], 0 ; GFX10: v_mov_b32_e32 v0, 0 ; GFX10: s_waitcnt lgkmcnt(0) ; GFX10: s_lshr_b32 s2, s2, 16 diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -654,11 +654,11 @@ ; HSA-DAG: buffer_store_byte [[VAL0]], off, s[0:3], 0 offset:8 ; HSA-DAG: buffer_store_dword [[VAL1]], off, s[0:3], 0 offset:12 -; HSA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[0:3], 0 offset:8 ; HSA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[0:3], 0 offset:12 +; HSA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[0:3], 0 offset:8 -; MESA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[36:39], 0 offset:8 ; MESA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[36:39], 0 offset:12 +; MESA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[36:39], 0 offset:8 ; GCN-DAG: s_movk_i32 [[SP:s[0-9]+]], 0x400{{$}} @@ -757,11 +757,11 @@ ; GCN-LABEL: {{^}}tail_call_stack_passed_arg_alignment_v32i32_f64: ; GCN-NOT: s32 -; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN: buffer_load_dword v33, off, s[0:3], s32{{$}} +; GCN: buffer_load_dword v32, off, s[0:3], s32{{$}} +; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; GCN: s_getpc_b64 -; GCN: buffer_store_dword v33, off, s[0:3], s32{{$}} -; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4 +; GCN: buffer_store_dword v32, off, s[0:3], s32{{$}} +; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; GCN-NOT: s32 ; GCN: s_setpc_b64 define void @tail_call_stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, double %tmp) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll @@ -566,7 +566,7 @@ ; FIXEDABI: v_and_b32_e32 v31, 0x3ff, v31 ; FIXEDABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v31 -; FIXEDABI: buffer_load_dword v0, off, s[0:3], s32{{$}} +; FIXEDABI: buffer_load_dword v31, off, s[0:3], s32{{$}} ; FIXEDABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; FIXEDABI: buffer_load_dword v0, off, s[0:3], s32 offset:4 glc{{$}} ; FIXEDABI: s_setpc_b64 diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -224,10 +224,10 @@ ; GCN-NEXT: v_mov_b32_e32 v0, 2 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: global_load_ushort v0, v[0:1], off +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_load_short_d16_hi v0, v[2:3], off +; GCN-NEXT: global_load_short_d16_hi v0, v[1:2], off ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; @@ -292,10 +292,10 @@ ; GCN-NEXT: v_mov_b32_e32 v0, 2 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: flat_load_ushort v0, v[0:1] +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: flat_load_short_d16_hi v0, v[2:3] +; GCN-NEXT: flat_load_short_d16_hi v0, v[1:2] ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; @@ -374,14 +374,14 @@ ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:8 ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4 -; GFX900-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:6 +; GFX900-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:6 +; GFX900-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:4 ; GFX900-NEXT: s_waitcnt vmcnt(1) -; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 ; GFX900-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:8 -; GFX900-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX900-NEXT: s_waitcnt vmcnt(1) +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX900-NEXT: v_lshl_or_b32 v0, v0, 16, v3 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX900-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll --- a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll +++ b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll @@ -15,7 +15,7 @@ ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1 ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1 ; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3 -; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 6 +; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 8 ; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0 ; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0 ; OSABI-AMDHSA-ASM: .end_amdhsa_kernel @@ -33,7 +33,7 @@ ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1 ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1 ; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3 -; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 6 +; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 8 ; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0 ; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0 ; OSABI-AMDHSA-ASM: .end_amdhsa_kernel diff --git a/llvm/test/CodeGen/AMDGPU/commute-shifts.ll b/llvm/test/CodeGen/AMDGPU/commute-shifts.ll --- a/llvm/test/CodeGen/AMDGPU/commute-shifts.ll +++ b/llvm/test/CodeGen/AMDGPU/commute-shifts.ll @@ -5,7 +5,6 @@ define amdgpu_ps float @main(float %arg0, float %arg1) #0 { ; SI-LABEL: main: ; SI: ; %bb.0: ; %bb -; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_mov_b32 s1, s0 ; SI-NEXT: s_mov_b32 s2, s0 @@ -15,6 +14,7 @@ ; SI-NEXT: s_mov_b32 s6, s0 ; SI-NEXT: s_mov_b32 s7, s0 ; SI-NEXT: image_load v2, v0, s[0:7] dmask:0x1 unorm +; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 ; SI-NEXT: v_and_b32_e32 v0, 7, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v0, v0, v2 @@ -26,7 +26,6 @@ ; ; VI-LABEL: main: ; VI: ; %bb.0: ; %bb -; VI-NEXT: v_cvt_i32_f32_e32 v0, v0 ; VI-NEXT: s_mov_b32 s0, 0 ; VI-NEXT: s_mov_b32 s1, s0 ; VI-NEXT: s_mov_b32 s2, s0 @@ -36,6 +35,7 @@ ; VI-NEXT: s_mov_b32 s6, s0 ; VI-NEXT: s_mov_b32 s7, s0 ; VI-NEXT: image_load v2, v0, s[0:7] dmask:0x1 unorm +; VI-NEXT: v_cvt_i32_f32_e32 v0, v0 ; VI-NEXT: v_and_b32_e32 v0, 7, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v0, v0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll --- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll +++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll @@ -8,35 +8,33 @@ define amdgpu_kernel void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { ; SI-LABEL: test_copy_v4i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_copy_v4i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -50,46 +48,49 @@ define amdgpu_kernel void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind { ; SI-LABEL: test_copy_v4i8_x2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s9, s7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_copy_v4i8_x2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s11, s3 -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm %tid.x = call i32 @llvm.amdgcn.workitem.id.x() @@ -240,23 +241,25 @@ define amdgpu_kernel void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind { ; SI-LABEL: test_copy_v4i8_extra_use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s12, 0xff00 ; SI-NEXT: s_movk_i32 s13, 0xff -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s9, s7 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 9, v0 @@ -272,31 +275,32 @@ ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x9000000, v1 -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v1, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_copy_v4i8_extra_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_movk_i32 s12, 0xff00 ; VI-NEXT: s_movk_i32 s13, 0xff ; VI-NEXT: s_movk_i32 s14, 0x900 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; VI-NEXT: v_and_b32_e32 v4, s12, v1 @@ -311,7 +315,7 @@ ; VI-NEXT: v_add_u16_e32 v2, s14, v2 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_e32 v1, v2, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: buffer_store_dword v1, off, s[8:11], 0 ; VI-NEXT: s_endpgm %tid.x = call i32 @llvm.amdgcn.workitem.id.x() @@ -327,27 +331,26 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind { ; SI-LABEL: test_copy_v4i8_x2_extra_use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 -; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[12:13], s[10:11] +; SI-NEXT: s_mov_b64 s[12:13], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 ; SI-NEXT: s_mov_b32 s16, 0xff00 ; SI-NEXT: s_movk_i32 s17, 0xff -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s14, s2 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 9, v0 ; SI-NEXT: v_and_b32_e32 v2, s16, v0 @@ -362,8 +365,9 @@ ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x9000000, v1 -; SI-NEXT: buffer_store_dword v1, off, s[12:15], 0 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_dword v1, off, s[12:15], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_copy_v4i8_x2_extra_use: @@ -389,7 +393,6 @@ ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; VI-NEXT: v_and_b32_e32 v4, s16, v1 ; VI-NEXT: v_add_u16_e32 v1, 9, v1 @@ -403,6 +406,7 @@ ; VI-NEXT: v_add_u16_e32 v2, s18, v2 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_e32 v1, v2, v1 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_store_dword v1, off, s[12:15], 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -419,22 +423,22 @@ define amdgpu_kernel void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { ; SI-LABEL: test_copy_v3i8_align4: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[4:5], s[10:11] +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s0, s8 -; SI-NEXT: s_mov_b32 s1, s9 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 -; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_byte v1, off, s[4:7], 0 offset:2 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_copy_v3i8_align4: @@ -463,40 +467,42 @@ define amdgpu_kernel void @test_copy_v3i8_align2(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { ; SI-LABEL: test_copy_v3i8_align2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 -; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:2 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:2 +; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 offset:2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_short v1, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_copy_v3i8_align2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:2 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2 -; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:2 +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 offset:2 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_store_short v1, off, s[4:7], 0 ; VI-NEXT: s_endpgm %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 2 store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 2 @@ -506,48 +512,48 @@ define amdgpu_kernel void @test_copy_v3i8_align1(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { ; SI-LABEL: test_copy_v3i8_align1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 ; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:1 ; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:1 +; SI-NEXT: buffer_store_byte v1, off, s[4:7], 0 offset:1 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_byte v2, off, s[0:3], 0 offset:2 +; SI-NEXT: buffer_store_byte v2, off, s[4:7], 0 offset:2 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_copy_v3i8_align1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 ; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:1 ; VI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:1 +; VI-NEXT: buffer_store_byte v1, off, s[4:7], 0 offset:1 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_byte v2, off, s[0:3], 0 offset:2 +; VI-NEXT: buffer_store_byte v2, off, s[4:7], 0 offset:2 ; VI-NEXT: s_endpgm %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 1 store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 1 @@ -557,36 +563,36 @@ define amdgpu_kernel void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { ; SI-LABEL: test_copy_v4i8_volatile_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_copy_v4i8_volatile_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %val = load volatile <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4 @@ -596,55 +602,55 @@ define amdgpu_kernel void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { ; SI-LABEL: test_copy_v4i8_volatile_store: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 -; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 -; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:1 -; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2 -; SI-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:3 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_byte v3, off, s[0:3], 0 offset:3 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:3 +; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:2 +; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:1 +; SI-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 offset:3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_byte v2, off, s[0:3], 0 offset:2 +; SI-NEXT: buffer_store_byte v1, off, s[4:7], 0 offset:2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:1 +; SI-NEXT: buffer_store_byte v2, off, s[4:7], 0 offset:1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_byte v3, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_copy_v4i8_volatile_store: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 -; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:1 -; VI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2 -; VI-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:3 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v3, off, s[0:3], 0 offset:3 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:3 +; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:2 +; VI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:1 +; VI-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 offset:3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v2, off, s[0:3], 0 offset:2 +; VI-NEXT: buffer_store_byte v1, off, s[4:7], 0 offset:2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:1 +; VI-NEXT: buffer_store_byte v2, off, s[4:7], 0 offset:1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_byte v3, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll b/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll --- a/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll +++ b/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll @@ -4,23 +4,25 @@ define amdgpu_kernel void @copy_to_scc(i32 addrspace(1)* %out, i32 addrspace(1)* %in, <4 x i32> addrspace(4)* %addrSrc) { ; GCN-LABEL: copy_to_scc: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GCN-NEXT: s_load_dword s10, s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:252 -; GCN-NEXT: s_cmp_lg_u32 s10, 0 -; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:252 +; GCN-NEXT: s_load_dword s2, s[2:3], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s2, 0 +; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: s_xor_b64 s[0:1], s[6:7], vcc -; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GCN-NEXT: s_cselect_b32 s0, 2, 3 -; GCN-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NEXT: global_store_dword v0, v1, s[4:5] +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_xor_b64 s[2:3], s[2:3], vcc +; GCN-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN-NEXT: s_cselect_b32 s2, 2, 3 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: global_store_dword v1, v0, s[0:1] ; GCN-NEXT: s_endpgm entry: ; preds = %1009 %0 = load i32, i32 addrspace(1)* %in, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -35,15 +35,15 @@ ; ; VI-LABEL: s_ctlz_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_flbit_i32_b32 s0, s0 -; VI-NEXT: s_min_u32 s0, s0, 32 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_flbit_i32_b32 s4, s4 +; VI-NEXT: s_min_u32 s4, s4, 32 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: s_ctlz_i32: @@ -110,20 +110,20 @@ ; ; VI-LABEL: v_ctlz_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v0, v0 ; VI-NEXT: v_min_u32_e32 v0, 32, v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctlz_i32: @@ -205,22 +205,22 @@ ; ; VI-LABEL: v_ctlz_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v1, v1 ; VI-NEXT: v_ffbh_u32_e32 v0, v0 ; VI-NEXT: v_min_u32_e32 v1, 32, v1 ; VI-NEXT: v_min_u32_e32 v0, 32, v0 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctlz_v2i32: @@ -313,16 +313,16 @@ ; ; VI-LABEL: v_ctlz_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v3, v3 ; VI-NEXT: v_ffbh_u32_e32 v2, v2 @@ -332,7 +332,7 @@ ; VI-NEXT: v_min_u32_e32 v2, 32, v2 ; VI-NEXT: v_min_u32_e32 v1, 32, v1 ; VI-NEXT: v_min_u32_e32 v0, 32, v0 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctlz_v4i32: @@ -432,20 +432,21 @@ ; ; VI-LABEL: v_ctlz_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_mov_b32 s2, s6 -; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 +; VI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v0, v0 ; VI-NEXT: v_min_u32_e32 v0, 32, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0 ; VI-NEXT: v_add_u16_e32 v0, -8, v0 -; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctlz_i8: @@ -532,17 +533,17 @@ ; ; VI-LABEL: s_ctlz_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x4c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_flbit_i32_b32 s0, s0 -; VI-NEXT: v_add_u32_e64 v0, s[2:3], s0, 32 clamp -; VI-NEXT: s_flbit_i32_b32 s0, s1 -; VI-NEXT: v_min3_u32 v0, v0, s0, 64 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: s_flbit_i32_b32 s4, s4 +; VI-NEXT: v_add_u32_e64 v0, s[6:7], s4, 32 clamp +; VI-NEXT: s_flbit_i32_b32 s4, s5 +; VI-NEXT: v_min3_u32 v0, v0, s4, 64 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: s_ctlz_i64: @@ -615,16 +616,16 @@ ; ; VI-LABEL: s_ctlz_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_flbit_i32_b32 s0, s0 -; VI-NEXT: v_add_u32_e64 v0, s[2:3], s0, 32 clamp -; VI-NEXT: s_flbit_i32_b32 s0, s1 -; VI-NEXT: v_min3_u32 v0, v0, s0, 64 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_flbit_i32_b32 s4, s4 +; VI-NEXT: v_add_u32_e64 v0, s[6:7], s4, 32 clamp +; VI-NEXT: s_flbit_i32_b32 s4, s5 +; VI-NEXT: v_min3_u32 v0, v0, s4, 64 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: s_ctlz_i64_trunc: @@ -700,17 +701,17 @@ ; ; VI-LABEL: v_ctlz_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, s3 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v3 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v3 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v3 +; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v3 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v0, v0 @@ -812,17 +813,17 @@ ; ; VI-LABEL: v_ctlz_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, s3 -; VI-NEXT: v_mov_b32_e32 v2, s1 -; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2] -; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v4, s1 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v0, v1 @@ -922,19 +923,19 @@ ; ; VI-LABEL: v_ctlz_i32_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v0, v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctlz_i32_sel_eq_neg1: @@ -1018,19 +1019,19 @@ ; ; VI-LABEL: v_ctlz_i32_sel_ne_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v0, v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctlz_i32_sel_ne_neg1: @@ -1118,22 +1119,22 @@ ; ; VI-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v0, v0 ; VI-NEXT: v_min_u32_e32 v0, 32, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctlz_i32_sel_eq_bitwidth: @@ -1225,22 +1226,22 @@ ; ; VI-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v0, v0 ; VI-NEXT: v_min_u32_e32 v0, 32, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctlz_i32_sel_ne_bitwidth: @@ -1328,18 +1329,18 @@ ; ; VI-LABEL: v_ctlz_i8_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v0, v0 -; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctlz_i8_sel_eq_neg1: @@ -1430,14 +1431,14 @@ ; ; VI-LABEL: v_ctlz_i16_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_mov_b32 s2, s6 -; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v1, 0xffff ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v2, v0 @@ -1445,7 +1446,8 @@ ; VI-NEXT: v_add_u32_e32 v2, vcc, -16, v2 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctlz_i16_sel_eq_neg1: @@ -1539,19 +1541,19 @@ ; ; VI-LABEL: v_ctlz_i7_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v0, v0 ; VI-NEXT: v_and_b32_e32 v0, 0x7f, v0 -; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctlz_i7_sel_eq_neg1: diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll --- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -31,13 +31,13 @@ ; ; VI-LABEL: s_ctlz_zero_undef_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: s_flbit_i32_b32 s0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_flbit_i32_b32 s2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -54,8 +54,8 @@ ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s4 @@ -88,18 +88,18 @@ ; ; VI-LABEL: v_ctlz_zero_undef_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -124,15 +124,15 @@ ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[4:5] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid @@ -164,16 +164,16 @@ ; ; VI-LABEL: v_ctlz_zero_undef_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v1, v1 ; VI-NEXT: v_ffbh_u32_e32 v0, v0 @@ -202,16 +202,16 @@ ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[4:5] +; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 -; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid @@ -245,16 +245,16 @@ ; ; VI-LABEL: v_ctlz_zero_undef_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v5, s3 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v3, v3 ; VI-NEXT: v_ffbh_u32_e32 v2, v2 @@ -287,18 +287,18 @@ ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v4i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[4:5] +; GFX9-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v3, v3 -; GFX9-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX9-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid @@ -329,19 +329,19 @@ ; ; VI-LABEL: v_ctlz_zero_undef_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v0, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0 ; VI-NEXT: v_add_u16_e32 v2, -8, v0 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -376,12 +376,12 @@ ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v1, v0 ; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v3, vcc ; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off @@ -389,7 +389,7 @@ ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX9-GISEL-NEXT: v_subrev_u32_e32 v0, 24, v0 -; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid @@ -418,17 +418,17 @@ ; ; VI-LABEL: s_ctlz_zero_undef_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: s_flbit_i32_b32 s0, s0 -; VI-NEXT: s_flbit_i32_b32 s1, s1 -; VI-NEXT: s_add_i32 s0, s0, 32 -; VI-NEXT: s_min_u32 s0, s0, s1 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_flbit_i32_b32 s2, s2 +; VI-NEXT: s_flbit_i32_b32 s3, s3 +; VI-NEXT: s_add_i32 s2, s2, 32 +; VI-NEXT: s_min_u32 s2, s2, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -450,15 +450,15 @@ ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_flbit_i32_b64 s0, s[4:5] +; GFX9-GISEL-NEXT: s_flbit_i32_b64 s0, s[2:3] ; GFX9-GISEL-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true) store i64 %ctlz, i64 addrspace(1)* %out @@ -483,16 +483,16 @@ ; ; VI-LABEL: s_ctlz_zero_undef_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: s_flbit_i32_b32 s0, s0 -; VI-NEXT: s_flbit_i32_b32 s1, s1 -; VI-NEXT: s_add_i32 s0, s0, 32 -; VI-NEXT: s_min_u32 s0, s0, s1 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_flbit_i32_b32 s2, s2 +; VI-NEXT: s_flbit_i32_b32 s3, s3 +; VI-NEXT: s_add_i32 s2, s2, 32 +; VI-NEXT: s_min_u32 s2, s2, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -513,13 +513,13 @@ ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64_trunc: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_flbit_i32_b64 s0, s[4:5] +; GFX9-GISEL-NEXT: s_flbit_i32_b64 s0, s[2:3] ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true) %trunc = trunc i64 %ctlz to i32 @@ -550,17 +550,17 @@ ; ; VI-LABEL: v_ctlz_zero_undef_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, s3 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v3 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v3 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v3 +; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v3 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v0, v0 @@ -597,18 +597,18 @@ ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] +; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX9-GISEL-NEXT: v_add_u32_e32 v0, 32, v0 ; GFX9-GISEL-NEXT: v_min_u32_e32 v0, v1, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid @@ -642,17 +642,17 @@ ; ; VI-LABEL: v_ctlz_zero_undef_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, s3 -; VI-NEXT: v_mov_b32_e32 v2, s1 -; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2] -; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v0, v1 @@ -689,18 +689,18 @@ ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64_trunc: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[4:5] +; GFX9-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 ; GFX9-GISEL-NEXT: v_add_u32_e32 v1, 32, v1 ; GFX9-GISEL-NEXT: v_min_u32_e32 v1, v2, v1 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid @@ -733,18 +733,18 @@ ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -770,17 +770,17 @@ ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[4:5] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid @@ -813,18 +813,18 @@ ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -850,17 +850,17 @@ ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[4:5] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid @@ -892,17 +892,17 @@ ; ; VI-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -935,12 +935,12 @@ ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v1, v0 ; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v3, vcc ; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off @@ -950,7 +950,7 @@ ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %valptr.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid @@ -988,16 +988,16 @@ ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v2, v[0:1] -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v3, v2 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 @@ -1038,18 +1038,18 @@ ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[4:5] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v0 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -1089,20 +1089,20 @@ ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v1, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1128,17 +1128,17 @@ ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[4:5] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid @@ -1174,20 +1174,20 @@ ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v1, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1213,17 +1213,17 @@ ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[4:5] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid @@ -1259,20 +1259,20 @@ ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v1, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 ; VI-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1299,17 +1299,17 @@ ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[4:5] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid @@ -1345,20 +1345,20 @@ ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v1, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 ; VI-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1385,17 +1385,17 @@ ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[4:5] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll --- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll @@ -14,28 +14,28 @@ define amdgpu_kernel void @s_ctpop_i16(i16 addrspace(1)* noalias %out, i16 %val) nounwind { ; SI-LABEL: s_ctpop_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s0, s0, 0xffff -; SI-NEXT: s_bcnt1_i32_b32 s0, s0 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_bcnt1_i32_b32 s4, s4 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_ctpop_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s0, s0, 0xffff -; VI-NEXT: s_bcnt1_i32_b32 s0, s0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_bcnt1_i32_b32 s4, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: s_ctpop_i16: @@ -72,36 +72,37 @@ define amdgpu_kernel void @v_ctpop_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: v_ctpop_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 0 -; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctpop_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ushort v0, v[0:1] +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_bcnt_u32_b32 v0, v0, 0 -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctpop_i16: @@ -141,48 +142,49 @@ define amdgpu_kernel void @v_ctpop_add_chain_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in0, i16 addrspace(1)* noalias %in1) nounwind { ; SI-LABEL: v_ctpop_add_chain_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_mov_b64 s[10:11], s[6:7] ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 0 ; SI-NEXT: v_bcnt_u32_b32_e32 v0, v2, v0 -; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctpop_add_chain_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_ushort v0, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_load_ushort v1, v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_bcnt_u32_b32 v1, v1, 0 ; VI-NEXT: v_bcnt_u32_b32 v0, v0, v1 -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctpop_add_chain_i16: @@ -237,38 +239,39 @@ define amdgpu_kernel void @v_ctpop_add_sgpr_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in, i16 %sval) nounwind { ; SI-LABEL: v_ctpop_add_sgpr_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb -; SI-NEXT: s_load_dword s0, s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, s0 -; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, s8 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctpop_add_sgpr_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ushort v0, v[0:1] +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_bcnt_u32_b32 v0, v0, s0 -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: v_bcnt_u32_b32 v0, v0, s4 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctpop_add_sgpr_i16: @@ -317,16 +320,16 @@ define amdgpu_kernel void @v_ctpop_v2i16(<2 x i16> addrspace(1)* noalias %out, <2 x i16> addrspace(1)* noalias %in) nounwind { ; SI-LABEL: v_ctpop_v2i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -334,21 +337,22 @@ ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_bcnt_u32_b32_e64 v1, v1, 0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctpop_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -356,7 +360,7 @@ ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_bcnt_u32_b32 v0, v0, 0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctpop_v2i16: @@ -396,21 +400,21 @@ define amdgpu_kernel void @v_ctpop_v4i16(<4 x i16> addrspace(1)* noalias %out, <4 x i16> addrspace(1)* noalias %in) nounwind { ; SI-LABEL: v_ctpop_v4i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s0, 0xffff -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s4, 0xffff +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, s0, v0 +; SI-NEXT: v_and_b32_e32 v2, s4, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v3, s0, v1 +; SI-NEXT: v_and_b32_e32 v3, s4, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_bcnt_u32_b32_e64 v1, v1, 0 ; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 0 @@ -420,27 +424,28 @@ ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctpop_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s4, 0xffff ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: s_mov_b32 s0, 0xffff +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; VI-NEXT: v_and_b32_e32 v1, s0, v1 -; VI-NEXT: v_and_b32_e32 v0, s0, v0 +; VI-NEXT: v_and_b32_e32 v1, s4, v1 +; VI-NEXT: v_and_b32_e32 v0, s4, v0 ; VI-NEXT: v_bcnt_u32_b32 v2, v2, 0 ; VI-NEXT: v_bcnt_u32_b32 v3, v3, 0 ; VI-NEXT: v_bcnt_u32_b32 v1, v1, 0 @@ -449,7 +454,7 @@ ; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_or_b32_e32 v0, v0, v3 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctpop_v4i16: @@ -521,25 +526,25 @@ define amdgpu_kernel void @v_ctpop_v8i16(<8 x i16> addrspace(1)* noalias %out, <8 x i16> addrspace(1)* noalias %in) nounwind { ; SI-LABEL: v_ctpop_v8i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s0, 0xffff -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s4, 0xffff +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v4, s0, v0 +; SI-NEXT: v_and_b32_e32 v4, s4, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v5, s0, v1 +; SI-NEXT: v_and_b32_e32 v5, s4, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v6, s0, v2 +; SI-NEXT: v_and_b32_e32 v6, s4, v2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v7, s0, v3 +; SI-NEXT: v_and_b32_e32 v7, s4, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_bcnt_u32_b32_e64 v3, v3, 0 ; SI-NEXT: v_bcnt_u32_b32_e64 v2, v2, 0 @@ -557,31 +562,32 @@ ; SI-NEXT: v_or_b32_e32 v2, v6, v2 ; SI-NEXT: v_or_b32_e32 v1, v5, v1 ; SI-NEXT: v_or_b32_e32 v0, v4, v0 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctpop_v8i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s4, 0xffff ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; VI-NEXT: s_mov_b32 s0, 0xffff +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; VI-NEXT: v_and_b32_e32 v3, s0, v3 -; VI-NEXT: v_and_b32_e32 v2, s0, v2 -; VI-NEXT: v_and_b32_e32 v1, s0, v1 -; VI-NEXT: v_and_b32_e32 v0, s0, v0 +; VI-NEXT: v_and_b32_e32 v3, s4, v3 +; VI-NEXT: v_and_b32_e32 v2, s4, v2 +; VI-NEXT: v_and_b32_e32 v1, s4, v1 +; VI-NEXT: v_and_b32_e32 v0, s4, v0 ; VI-NEXT: v_bcnt_u32_b32 v4, v4, 0 ; VI-NEXT: v_bcnt_u32_b32 v5, v5, 0 ; VI-NEXT: v_bcnt_u32_b32 v6, v6, 0 @@ -598,7 +604,7 @@ ; VI-NEXT: v_or_b32_e32 v2, v2, v5 ; VI-NEXT: v_or_b32_e32 v1, v1, v6 ; VI-NEXT: v_or_b32_e32 v0, v0, v7 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctpop_v8i16: @@ -702,36 +708,36 @@ define amdgpu_kernel void @v_ctpop_v16i16(<16 x i16> addrspace(1)* noalias %out, <16 x i16> addrspace(1)* noalias %in) nounwind { ; SI-LABEL: v_ctpop_v16i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 5, v0 ; SI-NEXT: v_mov_b32_e32 v5, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64 -; SI-NEXT: buffer_load_dwordx4 v[4:7], v[4:5], s[0:3], 0 addr64 offset:16 -; SI-NEXT: s_mov_b32 s0, 0xffff -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 offset:16 +; SI-NEXT: buffer_load_dwordx4 v[4:7], v[4:5], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s4, 0xffff +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v12, s0, v0 +; SI-NEXT: v_and_b32_e32 v8, s4, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v9, s4, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v10, s4, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v11, s4, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v8, s0, v4 +; SI-NEXT: v_and_b32_e32 v12, s4, v4 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v9, s0, v5 +; SI-NEXT: v_and_b32_e32 v13, s4, v5 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v10, s0, v6 +; SI-NEXT: v_and_b32_e32 v14, s4, v6 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v11, s0, v7 +; SI-NEXT: v_and_b32_e32 v15, s4, v7 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v13, s0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v14, s0, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v15, s0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_bcnt_u32_b32_e64 v7, v7, 0 ; SI-NEXT: v_bcnt_u32_b32_e64 v6, v6, 0 ; SI-NEXT: v_bcnt_u32_b32_e64 v5, v5, 0 @@ -740,6 +746,10 @@ ; SI-NEXT: v_bcnt_u32_b32_e64 v2, v2, 0 ; SI-NEXT: v_bcnt_u32_b32_e64 v1, v1, 0 ; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 0 +; SI-NEXT: v_bcnt_u32_b32_e64 v15, v15, 0 +; SI-NEXT: v_bcnt_u32_b32_e64 v14, v14, 0 +; SI-NEXT: v_bcnt_u32_b32_e64 v13, v13, 0 +; SI-NEXT: v_bcnt_u32_b32_e64 v12, v12, 0 ; SI-NEXT: v_bcnt_u32_b32_e64 v11, v11, 0 ; SI-NEXT: v_bcnt_u32_b32_e64 v10, v10, 0 ; SI-NEXT: v_bcnt_u32_b32_e64 v9, v9, 0 @@ -748,51 +758,48 @@ ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_bcnt_u32_b32_e64 v15, v15, 0 -; SI-NEXT: v_bcnt_u32_b32_e64 v14, v14, 0 -; SI-NEXT: v_bcnt_u32_b32_e64 v13, v13, 0 -; SI-NEXT: v_bcnt_u32_b32_e64 v12, v12, 0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v7, v11, v7 -; SI-NEXT: v_or_b32_e32 v6, v10, v6 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: v_or_b32_e32 v4, v8, v4 -; SI-NEXT: v_or_b32_e32 v3, v15, v3 -; SI-NEXT: v_or_b32_e32 v2, v14, v2 -; SI-NEXT: v_or_b32_e32 v1, v13, v1 -; SI-NEXT: v_or_b32_e32 v0, v12, v0 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v0 +; SI-NEXT: v_or_b32_e32 v3, v15, v7 +; SI-NEXT: v_or_b32_e32 v2, v14, v6 +; SI-NEXT: v_or_b32_e32 v1, v13, v5 +; SI-NEXT: v_or_b32_e32 v0, v12, v4 +; SI-NEXT: v_or_b32_e32 v7, v11, v16 +; SI-NEXT: v_or_b32_e32 v6, v10, v17 +; SI-NEXT: v_or_b32_e32 v5, v9, v18 +; SI-NEXT: v_or_b32_e32 v4, v8, v19 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctpop_v16i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s4, 0xffff ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; VI-NEXT: flat_load_dwordx4 v[0:3], v[4:5] +; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; VI-NEXT: s_mov_b32 s0, 0xffff +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 ; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 -; VI-NEXT: v_and_b32_e32 v3, s0, v3 -; VI-NEXT: v_and_b32_e32 v2, s0, v2 -; VI-NEXT: v_and_b32_e32 v1, s0, v1 -; VI-NEXT: v_and_b32_e32 v0, s0, v0 +; VI-NEXT: v_and_b32_e32 v3, s4, v3 +; VI-NEXT: v_and_b32_e32 v2, s4, v2 +; VI-NEXT: v_and_b32_e32 v1, s4, v1 +; VI-NEXT: v_and_b32_e32 v0, s4, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 ; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 @@ -802,10 +809,10 @@ ; VI-NEXT: v_bcnt_u32_b32 v9, v9, 0 ; VI-NEXT: v_bcnt_u32_b32 v10, v10, 0 ; VI-NEXT: v_bcnt_u32_b32 v11, v11, 0 -; VI-NEXT: v_and_b32_e32 v7, s0, v7 -; VI-NEXT: v_and_b32_e32 v6, s0, v6 -; VI-NEXT: v_and_b32_e32 v5, s0, v5 -; VI-NEXT: v_and_b32_e32 v4, s0, v4 +; VI-NEXT: v_and_b32_e32 v7, s4, v7 +; VI-NEXT: v_and_b32_e32 v6, s4, v6 +; VI-NEXT: v_and_b32_e32 v5, s4, v5 +; VI-NEXT: v_and_b32_e32 v4, s4, v4 ; VI-NEXT: v_bcnt_u32_b32 v3, v3, 0 ; VI-NEXT: v_bcnt_u32_b32 v2, v2, 0 ; VI-NEXT: v_bcnt_u32_b32 v1, v1, 0 @@ -834,8 +841,8 @@ ; VI-NEXT: v_or_b32_e32 v6, v6, v13 ; VI-NEXT: v_or_b32_e32 v5, v5, v14 ; VI-NEXT: v_or_b32_e32 v4, v4, v15 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctpop_v16i16: @@ -1019,36 +1026,37 @@ define amdgpu_kernel void @v_ctpop_i16_add_inline_constant(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: v_ctpop_i16_add_inline_constant: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 4 -; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctpop_i16_add_inline_constant: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ushort v0, v[0:1] +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_bcnt_u32_b32 v0, v0, 4 -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctpop_i16_add_inline_constant: @@ -1090,36 +1098,37 @@ define amdgpu_kernel void @v_ctpop_i16_add_inline_constant_inv(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: v_ctpop_i16_add_inline_constant_inv: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 4 -; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctpop_i16_add_inline_constant_inv: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ushort v0, v[0:1] +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_bcnt_u32_b32 v0, v0, 4 -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctpop_i16_add_inline_constant_inv: @@ -1161,38 +1170,39 @@ define amdgpu_kernel void @v_ctpop_i16_add_literal(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: v_ctpop_i16_add_literal: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_movk_i32 s0, 0x3e7 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_movk_i32 s4, 0x3e7 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, s0 -; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, s4 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctpop_i16_add_literal: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_movk_i32 s4, 0x3e7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: s_movk_i32 s0, 0x3e7 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_bcnt_u32_b32 v0, v0, s0 -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: v_bcnt_u32_b32 v0, v0, s4 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctpop_i16_add_literal: @@ -1234,38 +1244,39 @@ define amdgpu_kernel void @v_ctpop_i16_add_var(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in, i16 %const) nounwind { ; SI-LABEL: v_ctpop_i16_add_var: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb -; SI-NEXT: s_load_dword s0, s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, s0 -; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, s8 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctpop_i16_add_var: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ushort v0, v[0:1] +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_bcnt_u32_b32 v0, v0, s0 -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: v_bcnt_u32_b32 v0, v0, s4 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctpop_i16_add_var: @@ -1314,38 +1325,39 @@ define amdgpu_kernel void @v_ctpop_i16_add_var_inv(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in, i16 %const) nounwind { ; SI-LABEL: v_ctpop_i16_add_var_inv: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb -; SI-NEXT: s_load_dword s0, s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, s0 -; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, s8 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctpop_i16_add_var_inv: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ushort v0, v[0:1] +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_bcnt_u32_b32 v0, v0, s0 -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: v_bcnt_u32_b32 v0, v0, s4 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctpop_i16_add_var_inv: @@ -1394,44 +1406,45 @@ define amdgpu_kernel void @v_ctpop_i16_add_vvar_inv(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in, i16 addrspace(1)* noalias %constptr) nounwind { ; SI-LABEL: v_ctpop_i16_add_vvar_inv: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_mov_b64 s[10:11], s[6:7] ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bcnt_u32_b32_e32 v0, v2, v0 -; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctpop_i16_add_vvar_inv: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ushort v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ushort v0, v[0:1] +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_bcnt_u32_b32 v0, v3, v0 -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctpop_i16_add_vvar_inv: @@ -1485,15 +1498,14 @@ ; SI-LABEL: ctpop_i16_in_br: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dword s4, s[0:1], 0xd -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s2, s4, 16 -; SI-NEXT: s_cmp_lg_u32 s2, 0 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: s_cmp_lg_u32 s5, 0 ; SI-NEXT: s_cbranch_scc0 .LBB14_2 ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s8, s2 ; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 offset:2 @@ -1501,7 +1513,6 @@ ; SI-NEXT: s_cbranch_execz .LBB14_3 ; SI-NEXT: s_branch .LBB14_4 ; SI-NEXT: .LBB14_2: -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 s[2:3], -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: .LBB14_3: ; %if @@ -1518,33 +1529,35 @@ ; ; VI-LABEL: ctpop_i16_in_br: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s2, s[0:1], 0x34 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: v_cmp_ne_u16_e64 s[0:1], s0, 0 -; VI-NEXT: s_and_b64 vcc, exec, s[0:1] +; VI-NEXT: s_lshr_b32 s5, s4, 16 +; VI-NEXT: v_cmp_ne_u16_e64 s[6:7], s5, 0 +; VI-NEXT: s_and_b64 vcc, exec, s[6:7] ; VI-NEXT: s_cbranch_vccz .LBB14_2 ; VI-NEXT: ; %bb.1: ; %else ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 offset:2 +; VI-NEXT: s_mov_b64 s[2:3], 0 ; VI-NEXT: s_cbranch_execz .LBB14_3 ; VI-NEXT: s_branch .LBB14_4 ; VI-NEXT: .LBB14_2: +; VI-NEXT: s_mov_b64 s[2:3], -1 ; VI-NEXT: ; implicit-def: $vgpr0 ; VI-NEXT: .LBB14_3: ; %if -; VI-NEXT: s_and_b32 s0, s2, 0xffff -; VI-NEXT: s_bcnt1_i32_b32 s0, s0 +; VI-NEXT: s_and_b32 s2, s4, 0xffff +; VI-NEXT: s_bcnt1_i32_b32 s2, s2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: .LBB14_4: ; %endif -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: ctpop_i16_in_br: diff --git a/llvm/test/CodeGen/AMDGPU/ctpop64.ll b/llvm/test/CodeGen/AMDGPU/ctpop64.ll --- a/llvm/test/CodeGen/AMDGPU/ctpop64.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop64.ll @@ -164,8 +164,8 @@ } ; FUNC-LABEL: {{^}}s_ctpop_i65: -; GCN: s_bcnt1_i32_b64 [[REG0:s[0-9]+]], ; GCN: s_and_b32 +; GCN: s_bcnt1_i32_b64 [[REG0:s[0-9]+]], ; GCN: s_bcnt1_i32_b64 [[REG1:s[0-9]+]], ; GCN: s_add_i32 {{s[0-9]+}}, [[REG0]], [[REG1]] ; GCN: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll --- a/llvm/test/CodeGen/AMDGPU/cttz.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz.ll @@ -35,15 +35,15 @@ ; ; VI-LABEL: s_cttz_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ff1_i32_b32 s0, s0 -; VI-NEXT: s_min_u32 s0, s0, 32 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_ff1_i32_b32 s4, s4 +; VI-NEXT: s_min_u32 s4, s4, 32 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: s_cttz_i32: @@ -110,20 +110,20 @@ ; ; VI-LABEL: v_cttz_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbl_b32_e32 v0, v0 ; VI-NEXT: v_min_u32_e32 v0, 32, v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_cttz_i32: @@ -205,22 +205,22 @@ ; ; VI-LABEL: v_cttz_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbl_b32_e32 v1, v1 ; VI-NEXT: v_ffbl_b32_e32 v0, v0 ; VI-NEXT: v_min_u32_e32 v1, 32, v1 ; VI-NEXT: v_min_u32_e32 v0, 32, v0 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_cttz_v2i32: @@ -313,16 +313,16 @@ ; ; VI-LABEL: v_cttz_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbl_b32_e32 v3, v3 ; VI-NEXT: v_ffbl_b32_e32 v2, v2 @@ -332,7 +332,7 @@ ; VI-NEXT: v_min_u32_e32 v2, 32, v2 ; VI-NEXT: v_min_u32_e32 v1, 32, v1 ; VI-NEXT: v_min_u32_e32 v0, 32, v0 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_cttz_v4i32: @@ -431,18 +431,19 @@ ; ; VI-LABEL: v_cttz_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_mov_b32 s2, s6 -; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 +; VI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_e32 v0, 0x100, v0 ; VI-NEXT: v_ffbl_b32_e32 v0, v0 -; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_cttz_i8: @@ -525,17 +526,17 @@ ; ; VI-LABEL: s_cttz_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x4c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ff1_i32_b32 s1, s1 -; VI-NEXT: v_add_u32_e64 v0, s[2:3], s1, 32 clamp -; VI-NEXT: s_ff1_i32_b32 s0, s0 -; VI-NEXT: v_min3_u32 v0, s0, v0, 64 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: s_ff1_i32_b32 s5, s5 +; VI-NEXT: v_add_u32_e64 v0, s[6:7], s5, 32 clamp +; VI-NEXT: s_ff1_i32_b32 s4, s4 +; VI-NEXT: v_min3_u32 v0, s4, v0, 64 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: s_cttz_i64: @@ -608,16 +609,16 @@ ; ; VI-LABEL: s_cttz_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ff1_i32_b32 s1, s1 -; VI-NEXT: v_add_u32_e64 v0, s[2:3], s1, 32 clamp -; VI-NEXT: s_ff1_i32_b32 s0, s0 -; VI-NEXT: v_min3_u32 v0, s0, v0, 64 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_ff1_i32_b32 s5, s5 +; VI-NEXT: v_add_u32_e64 v0, s[6:7], s5, 32 clamp +; VI-NEXT: s_ff1_i32_b32 s4, s4 +; VI-NEXT: v_min3_u32 v0, s4, v0, 64 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: s_cttz_i64_trunc: @@ -693,17 +694,17 @@ ; ; VI-LABEL: v_cttz_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, s3 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v3 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v3 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v3 +; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v3 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbl_b32_e32 v1, v1 @@ -805,17 +806,17 @@ ; ; VI-LABEL: v_cttz_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, s3 -; VI-NEXT: v_mov_b32_e32 v2, s1 -; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2] -; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v4, s1 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbl_b32_e32 v0, v2 @@ -915,19 +916,19 @@ ; ; VI-LABEL: v_cttz_i32_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbl_b32_e32 v0, v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_cttz_i32_sel_eq_neg1: @@ -1011,19 +1012,19 @@ ; ; VI-LABEL: v_cttz_i32_sel_ne_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbl_b32_e32 v0, v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_cttz_i32_sel_ne_neg1: @@ -1111,22 +1112,22 @@ ; ; VI-LABEL: v_cttz_i32_sel_eq_bitwidth: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbl_b32_e32 v0, v0 ; VI-NEXT: v_min_u32_e32 v0, 32, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_cttz_i32_sel_eq_bitwidth: @@ -1218,22 +1219,22 @@ ; ; VI-LABEL: v_cttz_i32_sel_ne_bitwidth: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbl_b32_e32 v0, v0 ; VI-NEXT: v_min_u32_e32 v0, 32, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_cttz_i32_sel_ne_bitwidth: @@ -1321,18 +1322,18 @@ ; ; VI-LABEL: v_cttz_i8_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbl_b32_e32 v0, v0 -; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_cttz_i8_sel_eq_neg1: @@ -1422,14 +1423,14 @@ ; ; VI-LABEL: v_cttz_i16_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_mov_b32 s2, s6 -; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v1, 0xffff ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_e32 v2, 0x10000, v0 @@ -1437,7 +1438,8 @@ ; VI-NEXT: v_min_u32_e32 v2, 32, v2 ; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_cttz_i16_sel_eq_neg1: @@ -1530,19 +1532,19 @@ ; ; VI-LABEL: v_cttz_i7_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbl_b32_e32 v0, v0 ; VI-NEXT: v_and_b32_e32 v0, 0x7f, v0 -; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_cttz_i7_sel_eq_neg1: diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll --- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -28,13 +28,13 @@ ; ; VI-LABEL: s_cttz_zero_undef_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: s_ff1_i32_b32 s0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_ff1_i32_b32 s2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -51,8 +51,8 @@ ; ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_ff1_i32_b32 s0, s4 @@ -85,18 +85,18 @@ ; ; VI-LABEL: v_cttz_zero_undef_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbl_b32_e32 v2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -121,15 +121,15 @@ ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[4:5] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid @@ -161,16 +161,16 @@ ; ; VI-LABEL: v_cttz_zero_undef_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbl_b32_e32 v1, v1 ; VI-NEXT: v_ffbl_b32_e32 v0, v0 @@ -199,16 +199,16 @@ ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_v2i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[4:5] +; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 -; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid @@ -242,16 +242,16 @@ ; ; VI-LABEL: v_cttz_zero_undef_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v5, s3 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbl_b32_e32 v3, v3 ; VI-NEXT: v_ffbl_b32_e32 v2, v2 @@ -284,18 +284,18 @@ ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_v4i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[4:5] +; GFX9-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v2 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v3, v3 -; GFX9-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX9-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid @@ -320,13 +320,13 @@ ; ; VI-LABEL: s_cttz_zero_undef_i8_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: s_ff1_i32_b32 s0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_ff1_i32_b32 s2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -361,8 +361,8 @@ ; ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i8_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_ff1_i32_b32 s0, s4 @@ -391,13 +391,13 @@ ; ; VI-LABEL: s_cttz_zero_undef_i16_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: s_ff1_i32_b32 s0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_ff1_i32_b32 s2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -432,8 +432,8 @@ ; ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i16_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_ff1_i32_b32 s0, s4 @@ -462,13 +462,13 @@ ; ; VI-LABEL: s_cttz_zero_undef_i32_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: s_ff1_i32_b32 s0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_ff1_i32_b32 s2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -485,8 +485,8 @@ ; ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i32_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_ff1_i32_b32 s0, s4 @@ -519,17 +519,17 @@ ; ; VI-LABEL: s_cttz_zero_undef_i64_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: s_ff1_i32_b32 s1, s1 -; VI-NEXT: s_ff1_i32_b32 s0, s0 -; VI-NEXT: s_add_i32 s1, s1, 32 -; VI-NEXT: s_min_u32 s0, s0, s1 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_ff1_i32_b32 s3, s3 +; VI-NEXT: s_ff1_i32_b32 s2, s2 +; VI-NEXT: s_add_i32 s3, s3, 32 +; VI-NEXT: s_min_u32 s2, s2, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -551,15 +551,15 @@ ; ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i64_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_ff1_i32_b64 s0, s[4:5] +; GFX9-GISEL-NEXT: s_ff1_i32_b64 s0, s[2:3] ; GFX9-GISEL-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone %cttz_ret = icmp ne i64 %val, 0 @@ -589,18 +589,18 @@ ; ; VI-LABEL: v_cttz_zero_undef_i8_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbl_b32_e32 v1, v0 ; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -635,17 +635,17 @@ ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i8_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[4:5] +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1 ; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc -; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[2:3] +; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i8, i8 addrspace(1)* %arrayidx, align 1 %cttz = tail call i8 @llvm.cttz.i8(i8 %val, i1 true) nounwind readnone @@ -680,15 +680,15 @@ ; ; VI-LABEL: v_cttz_zero_undef_i16_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s4, s0, 1 -; VI-NEXT: s_addc_u32 s5, s1, 0 +; VI-NEXT: s_add_u32 s4, s2, 1 +; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(1) @@ -698,8 +698,8 @@ ; VI-NEXT: v_ffbl_b32_e32 v1, v0 ; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -734,19 +734,19 @@ ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i16_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[4:5] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[4:5] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1 ; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc -; GFX9-GISEL-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i16, i16 addrspace(1)* %arrayidx, align 1 %cttz = tail call i16 @llvm.cttz.i16(i16 %val, i1 true) nounwind readnone @@ -787,23 +787,23 @@ ; ; VI-LABEL: v_cttz_zero_undef_i32_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s4, s0, 3 -; VI-NEXT: s_addc_u32 s5, s1, 0 +; VI-NEXT: s_add_u32 s4, s2, 3 +; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: s_add_u32 s4, s0, 2 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_addc_u32 s5, s1, 0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_add_u32 s0, s0, 1 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_add_u32 s4, s2, 2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: s_addc_u32 s5, s3, 0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_add_u32 s2, s2, 1 +; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v7, s1 +; VI-NEXT: v_mov_b32_e32 v7, s3 ; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: v_mov_b32_e32 v6, s0 +; VI-NEXT: v_mov_b32_e32 v6, s2 ; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v3, v[4:5] ; VI-NEXT: flat_load_ubyte v4, v[6:7] @@ -819,8 +819,8 @@ ; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: v_ffbl_b32_e32 v0, v0 ; VI-NEXT: v_min_u32_e32 v2, 32, v0 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -848,14 +848,14 @@ ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[4:5] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[4:5] offset:1 -; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[4:5] offset:2 -; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[4:5] offset:3 +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:2 +; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:3 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) @@ -866,7 +866,7 @@ ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i32, i32 addrspace(1)* %arrayidx, align 1 %cttz = tail call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone @@ -921,73 +921,74 @@ ; ; VI-LABEL: v_cttz_zero_undef_i64_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s4, s0, 5 -; VI-NEXT: s_addc_u32 s5, s1, 0 +; VI-NEXT: s_add_u32 s4, s2, 5 +; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_add_u32 s4, s0, 4 -; VI-NEXT: s_addc_u32 s5, s1, 0 +; VI-NEXT: s_add_u32 s4, s2, 4 +; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: s_add_u32 s4, s0, 7 -; VI-NEXT: s_addc_u32 s5, s1, 0 +; VI-NEXT: s_add_u32 s4, s2, 7 +; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: s_add_u32 s4, s0, 6 -; VI-NEXT: s_addc_u32 s5, s1, 0 +; VI-NEXT: s_add_u32 s4, s2, 6 +; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v7, s5 ; VI-NEXT: v_mov_b32_e32 v6, s4 -; VI-NEXT: s_add_u32 s4, s0, 3 -; VI-NEXT: s_addc_u32 s5, s1, 0 +; VI-NEXT: s_add_u32 s4, s2, 3 +; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v9, s5 ; VI-NEXT: v_mov_b32_e32 v8, s4 -; VI-NEXT: s_add_u32 s4, s0, 2 -; VI-NEXT: s_addc_u32 s5, s1, 0 +; VI-NEXT: s_add_u32 s4, s2, 2 +; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v11, s5 ; VI-NEXT: v_mov_b32_e32 v10, s4 -; VI-NEXT: s_add_u32 s4, s0, 1 -; VI-NEXT: s_addc_u32 s5, s1, 0 -; VI-NEXT: v_mov_b32_e32 v13, s5 -; VI-NEXT: v_mov_b32_e32 v15, s1 -; VI-NEXT: v_mov_b32_e32 v12, s4 -; VI-NEXT: v_mov_b32_e32 v14, s0 +; VI-NEXT: flat_load_ubyte v12, v[0:1] +; VI-NEXT: flat_load_ubyte v13, v[2:3] +; VI-NEXT: flat_load_ubyte v4, v[4:5] +; VI-NEXT: flat_load_ubyte v5, v[6:7] +; VI-NEXT: s_add_u32 s4, s2, 1 +; VI-NEXT: flat_load_ubyte v6, v[8:9] +; VI-NEXT: s_addc_u32 s5, s3, 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: flat_load_ubyte v7, v[10:11] ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: flat_load_ubyte v2, v[2:3] -; VI-NEXT: flat_load_ubyte v3, v[4:5] -; VI-NEXT: flat_load_ubyte v4, v[6:7] -; VI-NEXT: flat_load_ubyte v5, v[8:9] -; VI-NEXT: flat_load_ubyte v6, v[10:11] -; VI-NEXT: flat_load_ubyte v7, v[12:13] -; VI-NEXT: flat_load_ubyte v8, v[14:15] ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v12 ; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_or_b32_e32 v0, v0, v2 +; VI-NEXT: v_or_b32_e32 v3, v3, v13 ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v3, v2, v0 -; VI-NEXT: v_ffbl_b32_e32 v2, v3 +; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v4, v3 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v2 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v6 +; VI-NEXT: v_ffbl_b32_e32 v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 32, v5 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v4, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v7 -; VI-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_e32 v2, v2, v8 -; VI-NEXT: v_or_b32_e32 v2, v0, v2 +; VI-NEXT: v_or_b32_e32 v0, v0, v2 +; VI-NEXT: v_or_b32_e32 v2, v4, v0 ; VI-NEXT: v_ffbl_b32_e32 v0, v2 ; VI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; VI-NEXT: v_min_u32_e32 v0, v0, v4 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_min_u32_e32 v0, v0, v5 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_cndmask_b32_e32 v0, 32, v0, vcc -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -1027,35 +1028,37 @@ ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i64_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: s_movk_i32 s0, 0xff +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v0, v1, s[4:5] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v1, s[4:5] offset:1 -; GFX9-GISEL-NEXT: global_load_ubyte v3, v1, s[4:5] offset:2 -; GFX9-GISEL-NEXT: global_load_ubyte v4, v1, s[4:5] offset:3 -; GFX9-GISEL-NEXT: global_load_ubyte v5, v1, s[4:5] offset:4 -; GFX9-GISEL-NEXT: global_load_ubyte v6, v1, s[4:5] offset:5 -; GFX9-GISEL-NEXT: global_load_ubyte v7, v1, s[4:5] offset:6 -; GFX9-GISEL-NEXT: global_load_ubyte v8, v1, s[4:5] offset:7 +; GFX9-GISEL-NEXT: global_load_ubyte v0, v1, s[2:3] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v2, v1, s[2:3] offset:3 +; GFX9-GISEL-NEXT: global_load_ubyte v3, v1, s[2:3] offset:5 +; GFX9-GISEL-NEXT: global_load_ubyte v4, v1, s[2:3] offset:7 +; GFX9-GISEL-NEXT: global_load_ubyte v5, v1, s[2:3] +; GFX9-GISEL-NEXT: global_load_ubyte v6, v1, s[2:3] offset:2 +; GFX9-GISEL-NEXT: global_load_ubyte v7, v1, s[2:3] offset:4 +; GFX9-GISEL-NEXT: global_load_ubyte v8, v1, s[2:3] offset:6 +; GFX9-GISEL-NEXT: s_movk_i32 s2, 0xff +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(7) +; GFX9-GISEL-NEXT: v_and_b32_e32 v0, s2, v0 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(6) -; GFX9-GISEL-NEXT: v_and_b32_e32 v2, s0, v2 -; GFX9-GISEL-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-GISEL-NEXT: v_and_b32_e32 v2, s2, v2 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(5) +; GFX9-GISEL-NEXT: v_and_b32_e32 v3, s2, v3 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(4) -; GFX9-GISEL-NEXT: v_and_b32_e32 v4, s0, v4 +; GFX9-GISEL-NEXT: v_and_b32_e32 v4, s2, v4 +; GFX9-GISEL-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX9-GISEL-NEXT: v_lshlrev_b16_e32 v4, 8, v4 -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) -; GFX9-GISEL-NEXT: v_and_b32_e32 v6, s0, v6 -; GFX9-GISEL-NEXT: v_lshlrev_b16_e32 v6, 8, v6 +; GFX9-GISEL-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-GISEL-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) +; GFX9-GISEL-NEXT: v_or_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_and_b32_e32 v8, s0, v8 -; GFX9-GISEL-NEXT: v_lshlrev_b16_e32 v8, 8, v8 -; GFX9-GISEL-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-GISEL-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-GISEL-NEXT: v_or_b32_sdwa v3, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-GISEL-NEXT: v_or_b32_sdwa v4, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-GISEL-NEXT: v_or_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-GISEL-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-GISEL-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-GISEL-NEXT: v_bfe_u32 v3, v3, 0, 16 ; GFX9-GISEL-NEXT: v_bfe_u32 v4, v4, 0, 16 ; GFX9-GISEL-NEXT: v_bfe_u32 v0, v0, 0, 16 @@ -1068,7 +1071,7 @@ ; GFX9-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX9-GISEL-NEXT: v_min_u32_e32 v0, v0, v4 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 32, v0, vcc -; GFX9-GISEL-NEXT: global_store_dwordx2 v1, v[0:1], s[2:3] +; GFX9-GISEL-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i64, i64 addrspace(1)* %arrayidx, align 1 %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone @@ -1108,23 +1111,23 @@ ; ; VI-LABEL: v_cttz_i32_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s4, s0, 3 -; VI-NEXT: s_addc_u32 s5, s1, 0 +; VI-NEXT: s_add_u32 s4, s2, 3 +; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: s_add_u32 s4, s0, 2 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_addc_u32 s5, s1, 0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_add_u32 s0, s0, 1 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_add_u32 s4, s2, 2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: s_addc_u32 s5, s3, 0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_add_u32 s2, s2, 1 +; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v7, s1 +; VI-NEXT: v_mov_b32_e32 v7, s3 ; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: v_mov_b32_e32 v6, s0 +; VI-NEXT: v_mov_b32_e32 v6, s2 ; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v3, v[4:5] ; VI-NEXT: flat_load_ubyte v4, v[6:7] @@ -1139,8 +1142,8 @@ ; VI-NEXT: v_or_b32_e32 v0, v2, v0 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: v_ffbl_b32_e32 v2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1170,14 +1173,14 @@ ; ; GFX9-GISEL-LABEL: v_cttz_i32_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[4:5] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[4:5] offset:1 -; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[4:5] offset:2 -; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[4:5] offset:3 +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:2 +; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:3 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) @@ -1189,7 +1192,7 @@ ; GFX9-GISEL-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i32, i32 addrspace(1)* %arrayidx, align 1 %ctlz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone @@ -1229,23 +1232,23 @@ ; ; VI-LABEL: v_cttz_i32_sel_ne_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s4, s0, 3 -; VI-NEXT: s_addc_u32 s5, s1, 0 +; VI-NEXT: s_add_u32 s4, s2, 3 +; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: s_add_u32 s4, s0, 2 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_addc_u32 s5, s1, 0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_add_u32 s0, s0, 1 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_add_u32 s4, s2, 2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: s_addc_u32 s5, s3, 0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_add_u32 s2, s2, 1 +; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v7, s1 +; VI-NEXT: v_mov_b32_e32 v7, s3 ; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: v_mov_b32_e32 v6, s0 +; VI-NEXT: v_mov_b32_e32 v6, s2 ; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v3, v[4:5] ; VI-NEXT: flat_load_ubyte v4, v[6:7] @@ -1260,8 +1263,8 @@ ; VI-NEXT: v_or_b32_e32 v0, v2, v0 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: v_ffbl_b32_e32 v2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1291,14 +1294,14 @@ ; ; GFX9-GISEL-LABEL: v_cttz_i32_sel_ne_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[4:5] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[4:5] offset:1 -; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[4:5] offset:2 -; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[4:5] offset:3 +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:2 +; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:3 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) @@ -1310,7 +1313,7 @@ ; GFX9-GISEL-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v2, vcc -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i32, i32 addrspace(1)* %arrayidx, align 1 %ctlz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone @@ -1353,23 +1356,23 @@ ; ; VI-LABEL: v_cttz_i32_sel_ne_bitwidth: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s4, s0, 3 -; VI-NEXT: s_addc_u32 s5, s1, 0 +; VI-NEXT: s_add_u32 s4, s2, 3 +; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: s_add_u32 s4, s0, 2 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_addc_u32 s5, s1, 0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_add_u32 s0, s0, 1 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_add_u32 s4, s2, 2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: s_addc_u32 s5, s3, 0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_add_u32 s2, s2, 1 +; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v7, s1 +; VI-NEXT: v_mov_b32_e32 v7, s3 ; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: v_mov_b32_e32 v6, s0 +; VI-NEXT: v_mov_b32_e32 v6, s2 ; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v3, v[4:5] ; VI-NEXT: flat_load_ubyte v4, v[6:7] @@ -1387,8 +1390,8 @@ ; VI-NEXT: v_min_u32_e32 v0, 32, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0 ; VI-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1420,14 +1423,14 @@ ; ; GFX9-GISEL-LABEL: v_cttz_i32_sel_ne_bitwidth: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[4:5] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[4:5] offset:1 -; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[4:5] offset:2 -; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[4:5] offset:3 +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:2 +; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:3 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) @@ -1439,7 +1442,7 @@ ; GFX9-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 32, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v1, vcc -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i32, i32 addrspace(1)* %arrayidx, align 1 %ctlz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone @@ -1468,11 +1471,11 @@ ; ; VI-LABEL: v_cttz_i8_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, 0xff ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1480,8 +1483,8 @@ ; VI-NEXT: v_ffbl_b32_e32 v2, v2 ; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1514,19 +1517,19 @@ ; ; GFX9-GISEL-LABEL: v_cttz_i8_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xff ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[4:5] +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_or_b32_e32 v3, 0x100, v1 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v3, v3 ; GFX9-GISEL-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc -; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[2:3] +; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i8, i8 addrspace(1)* %arrayidx, align 1 %ctlz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone @@ -1559,15 +1562,15 @@ ; ; VI-LABEL: v_cttz_i16_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s4, s0, 1 -; VI-NEXT: s_addc_u32 s5, s1, 0 +; VI-NEXT: s_add_u32 s4, s2, 1 +; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, 0xffff @@ -1580,8 +1583,8 @@ ; VI-NEXT: v_min_u32_e32 v2, 32, v2 ; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1614,13 +1617,13 @@ ; ; GFX9-GISEL-LABEL: v_cttz_i16_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[4:5] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[4:5] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: v_or_b32_e32 v2, 0x10000, v1 @@ -1628,7 +1631,7 @@ ; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-GISEL-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i16, i16 addrspace(1)* %arrayidx, align 1 %ctlz = call i16 @llvm.cttz.i16(i16 %val, i1 false) nounwind readnone diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -20,6 +20,12 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_uitofp_i32_to_f32_mask255: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] %masked = and i32 %arg0, 255 %cvt = uitofp i32 %masked to float ret float %cvt @@ -38,6 +44,12 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_sitofp_i32_to_f32_mask255: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] %masked = and i32 %arg0, 255 %cvt = sitofp i32 %masked to float ret float %cvt @@ -58,6 +70,13 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 7, v0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_uitofp_to_f32_lshr7_mask255: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 7, v0 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] %lshr.7 = lshr i32 %arg0, 7 %masked = and i32 %lshr.7, 255 %cvt = uitofp i32 %masked to float @@ -77,6 +96,12 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_uitofp_to_f32_lshr8_mask255: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] %lshr.8 = lshr i32 %arg0, 8 %masked = and i32 %lshr.8, 255 %cvt = uitofp i32 %masked to float @@ -115,6 +140,15 @@ ; GFX10-NEXT: global_store_dword v[0:1], v1, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: global_store_dword v[0:1], v1, off +; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] %lshr.8 = lshr i32 %arg0, 8 store i32 %lshr.8, i32 addrspace(1)* undef %masked = and i32 %lshr.8, 255 @@ -135,6 +169,12 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_uitofp_to_f32_lshr16_mask255: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] %lshr.16 = lshr i32 %arg0, 16 %masked = and i32 %lshr.16, 255 %cvt = uitofp i32 %masked to float @@ -154,6 +194,12 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_uitofp_to_f32_lshr24_mask255: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] %lshr.16 = lshr i32 %arg0, 24 %masked = and i32 %lshr.16, 255 %cvt = uitofp i32 %masked to float @@ -173,6 +219,12 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_uitofp_i8_to_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] %cvt = uitofp i8 %arg0 to float ret float %cvt } @@ -194,6 +246,14 @@ ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_uitofp_v2i8_to_v2f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 +; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] %val = bitcast i16 %arg0 to <2 x i8> %cvt = uitofp <2 x i8> %val to <2 x float> ret <2 x float> %cvt @@ -218,6 +278,15 @@ ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_uitofp_v3i8_to_v3f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, v0 +; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 +; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] %trunc = trunc i32 %arg0 to i24 %val = bitcast i24 %trunc to <3 x i8> %cvt = uitofp <3 x i8> %val to <3 x float> @@ -245,6 +314,16 @@ ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_uitofp_v4i8_to_v4f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v4, v0 +; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 +; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 +; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: s_setpc_b64 s[30:31] %val = bitcast i32 %arg0 to <4 x i8> %cvt = uitofp <4 x i8> %val to <4 x float> ret <4 x float> %cvt @@ -271,6 +350,16 @@ ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_uitofp_unpack_i32_to_v4f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v4, v0 +; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 +; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 +; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: s_setpc_b64 s[30:31] %mask.arg0 = and i32 %arg0, 255 %cvt0 = uitofp i32 %mask.arg0 to float @@ -316,6 +405,13 @@ ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_uitofp_i32_to_f16_mask255: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] %masked = and i32 %arg0, 255 %cvt = uitofp i32 %masked to half ret half %cvt @@ -344,6 +440,13 @@ ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_sitofp_i32_to_f16_mask255: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] %masked = and i32 %arg0, 255 %cvt = sitofp i32 %masked to half ret half %cvt @@ -372,6 +475,13 @@ ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 ; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_uitofp_to_f16_lshr8_mask255: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 +; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] %lshr.8 = lshr i32 %arg0, 8 %masked = and i32 %lshr.8, 255 %cvt = uitofp i32 %masked to half @@ -401,6 +511,13 @@ ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 ; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_uitofp_to_f16_lshr16_mask255: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 +; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] %lshr.16 = lshr i32 %arg0, 16 %masked = and i32 %lshr.16, 255 %cvt = uitofp i32 %masked to half @@ -430,6 +547,13 @@ ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 ; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_uitofp_to_f16_lshr24_mask255: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 +; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] %lshr.16 = lshr i32 %arg0, 24 %masked = and i32 %lshr.16, 255 %cvt = uitofp i32 %masked to half @@ -457,6 +581,12 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_uitofp_i8_to_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; GFX9-NEXT: s_setpc_b64 s[30:31] %cvt = uitofp i8 %arg0 to half ret half %cvt } @@ -476,6 +606,13 @@ ; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_uitofp_i32_to_f64_mask255: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] %masked = and i32 %arg0, 255 %cvt = uitofp i32 %masked to double ret double %cvt @@ -496,6 +633,13 @@ ; GFX10-NEXT: v_bfe_u32 v0, v0, 8, 8 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_uitofp_to_f64_lshr8_mask255: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_bfe_u32 v0, v0, 8, 8 +; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] %lshr.8 = lshr i32 %arg0, 8 %masked = and i32 %lshr.8, 255 %cvt = uitofp i32 %masked to double @@ -517,6 +661,13 @@ ; GFX10-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_uitofp_to_f64_lshr16_mask255: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_bfe_u32 v0, v0, 16, 8 +; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] %lshr.16 = lshr i32 %arg0, 16 %masked = and i32 %lshr.16, 255 %cvt = uitofp i32 %masked to double @@ -538,6 +689,13 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_uitofp_to_f64_lshr24_mask255: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] %lshr.16 = lshr i32 %arg0, 24 %masked = and i32 %lshr.16, 255 %cvt = uitofp i32 %masked to double @@ -568,6 +726,14 @@ ; GFX10-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_uitofp_i8_to_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: v_and_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] %cvt = uitofp i8 %arg0 to double ret double %cvt } @@ -575,34 +741,35 @@ define amdgpu_kernel void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: load_i8_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: load_i8_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: load_i8_to_f32: @@ -616,6 +783,23 @@ ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX9-LABEL: load_i8_to_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid %load = load i8, i8 addrspace(1)* %gep, align 1 @@ -627,38 +811,39 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind { ; SI-LABEL: load_v2i8_to_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: load_v2i8_to_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ushort v0, v[0:1] +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: load_v2i8_to_v2f32: @@ -674,6 +859,25 @@ ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX9-LABEL: load_v2i8_to_v2f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <2 x i8>, <2 x i8> addrspace(1)* %in, i32 %tid %load = load <2 x i8>, <2 x i8> addrspace(1)* %gep, align 2 @@ -685,41 +889,42 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind { ; SI-LABEL: load_v3i8_to_v3f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v2 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v2 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: load_v3i8_to_v3f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: load_v3i8_to_v3f32: @@ -736,6 +941,26 @@ ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX10-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX9-LABEL: load_v3i8_to_v3f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 +; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid %load = load <3 x i8>, <3 x i8> addrspace(1)* %gep, align 4 @@ -747,42 +972,43 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: load_v4i8_to_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: load_v4i8_to_v4f32: @@ -800,6 +1026,27 @@ ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX9-LABEL: load_v4i8_to_v4f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 +; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 +; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4 @@ -815,59 +1062,61 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32_unaligned: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:1 -; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:2 -; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[4:7], 0 addr64 offset:3 +; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:2 +; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[4:7], 0 addr64 offset:1 +; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v2 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v5 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v0 -; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: load_v4i8_to_v4f32_unaligned: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 2, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v0 -; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ubyte v4, v[4:5] -; VI-NEXT: flat_load_ubyte v5, v[6:7] -; VI-NEXT: flat_load_ubyte v6, v[2:3] -; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: flat_load_ubyte v2, v[2:3] +; VI-NEXT: flat_load_ubyte v3, v[4:5] +; VI-NEXT: flat_load_ubyte v4, v[0:1] +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_ubyte v1, v[0:1] +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v5 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v6 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: load_v4i8_to_v4f32_unaligned: @@ -892,6 +1141,33 @@ ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v5 ; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX9-LABEL: load_v4i8_to_v4f32_unaligned: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: global_load_ubyte v2, v[0:1], off offset:3 +; GFX9-NEXT: global_load_ubyte v4, v[0:1], off offset:2 +; GFX9-NEXT: global_load_ubyte v5, v[0:1], off offset:1 +; GFX9-NEXT: global_load_ubyte v6, v[0:1], off +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, v2 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v6 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1 @@ -905,20 +1181,19 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32_2_uses: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v4, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_movk_i32 s0, 0xff -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_movk_i32 s8, 0xff +; SI-NEXT: s_mov_b32 s6, s2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v4 @@ -928,39 +1203,40 @@ ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 9, v4 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v0, s0, v4 +; SI-NEXT: v_and_b32_e32 v0, s8, v4 ; SI-NEXT: v_add_i32_e32 v2, vcc, 9, v5 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v6 ; SI-NEXT: v_or_b32_e32 v0, v7, v0 -; SI-NEXT: v_and_b32_e32 v2, s0, v2 +; SI-NEXT: v_and_b32_e32 v2, s8, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x900, v0 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x9000000, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: load_v4i8_to_v4f32_2_uses: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v5, 9 +; VI-NEXT: s_movk_i32 s8, 0x900 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: s_mov_b32 s10, s6 -; VI-NEXT: s_mov_b32 s11, s7 -; VI-NEXT: s_movk_i32 s0, 0x900 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v6, 24, v4 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v4 @@ -970,16 +1246,17 @@ ; VI-NEXT: v_and_b32_e32 v7, 0xffffff00, v4 ; VI-NEXT: v_add_u16_e32 v8, 9, v4 ; VI-NEXT: v_add_u16_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_nop 0 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v6 ; VI-NEXT: v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u16_e32 v0, s0, v0 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_add_u16_e32 v0, s8, v0 ; VI-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: load_v4i8_to_v4f32_2_uses: @@ -1015,6 +1292,42 @@ ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX10-NEXT: global_store_dword v4, v5, s[4:5] ; GFX10-NEXT: s_endpgm +; +; GFX9-LABEL: load_v4i8_to_v4f32_2_uses: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_mov_b32_e32 v6, 9 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2c +; GFX9-NEXT: s_movk_i32 s4, 0x900 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v3, v4 +; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v4 +; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v4 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffffff00, v4 +; GFX9-NEXT: v_add_u16_e32 v9, 9, v4 +; GFX9-NEXT: v_add_u16_sdwa v4, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v7 +; GFX9-NEXT: v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, s4, v0 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: global_store_dword v5, v0, s[2:3] +; GFX9-NEXT: s_endpgm %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x %load = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4 @@ -1029,87 +1342,86 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind { ; SI-LABEL: load_v7i8_to_v7f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:1 -; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:2 -; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:3 -; SI-NEXT: buffer_load_ubyte v6, v[0:1], s[0:3], 0 addr64 offset:4 -; SI-NEXT: buffer_load_ubyte v7, v[0:1], s[0:3], 0 addr64 offset:5 -; SI-NEXT: buffer_load_ubyte v8, v[0:1], s[0:3], 0 addr64 offset:6 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[4:7], 0 addr64 offset:3 +; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:2 +; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[4:7], 0 addr64 offset:1 +; SI-NEXT: buffer_load_ubyte v6, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_ubyte v7, v[0:1], s[4:7], 0 addr64 offset:5 +; SI-NEXT: buffer_load_ubyte v8, v[0:1], s[4:7], 0 addr64 offset:4 +; SI-NEXT: buffer_load_ubyte v9, v[0:1], s[4:7], 0 addr64 offset:6 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v2 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v5 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v5 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v3 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v6 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v6 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_ubyte0_e32 v5, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_ubyte0_e32 v6, v8 -; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:24 -; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v6, v9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:24 +; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: load_v7i8_to_v7f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 5, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 5, v0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ubyte v12, v[4:5] -; VI-NEXT: v_add_u32_e32 v4, vcc, 6, v0 +; VI-NEXT: flat_load_ubyte v10, v[2:3] +; VI-NEXT: v_add_u32_e32 v2, vcc, 6, v0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 1, v0 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, 1, v0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 2, v0 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v8, vcc, 2, v0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v0 -; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ubyte v8, v[8:9] -; VI-NEXT: flat_load_ubyte v9, v[10:11] ; VI-NEXT: flat_load_ubyte v6, v[6:7] +; VI-NEXT: flat_load_ubyte v7, v[8:9] +; VI-NEXT: flat_load_ubyte v8, v[2:3] +; VI-NEXT: flat_load_ubyte v2, v[0:1] ; VI-NEXT: flat_load_ubyte v4, v[4:5] -; VI-NEXT: flat_load_ubyte v7, v[2:3] -; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_ubyte v9, v[0:1] +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v5, v12 -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v8 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v5, v10 ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v9 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v6 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v7 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v4 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v6 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v7 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v8 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; VI-NEXT: buffer_store_dwordx3 v[4:6], off, s[4:7], 0 offset:16 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v9 +; VI-NEXT: buffer_store_dwordx3 v[4:6], off, s[0:3], 0 offset:16 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: load_v7i8_to_v7f32: @@ -1142,6 +1454,42 @@ ; GFX10-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] offset:16 ; GFX10-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX9-LABEL: load_v7i8_to_v7f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: global_load_ushort v2, v[0:1], off offset:4 +; GFX9-NEXT: global_load_ubyte v3, v[0:1], off offset:6 +; GFX9-NEXT: global_load_ubyte v4, v[0:1], off offset:3 +; GFX9-NEXT: global_load_ubyte v5, v[0:1], off offset:2 +; GFX9-NEXT: global_load_ubyte v7, v[0:1], off offset:1 +; GFX9-NEXT: global_load_ubyte v8, v[0:1], off +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v2 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v6, v3 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, v4 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v5 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v7 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v8 +; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v5, v10 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v4, v10 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] +; GFX9-NEXT: global_store_dwordx3 v9, v[4:6], s[0:1] offset:16 +; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <7 x i8>, <7 x i8> addrspace(1)* %in, i32 %tid %load = load <7 x i8>, <7 x i8> addrspace(1)* %gep, align 1 @@ -1153,16 +1501,16 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind { ; SI-LABEL: load_v8i8_to_v8f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dwordx2 v[7:8], v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[7:8], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v7 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v7 @@ -1172,22 +1520,23 @@ ; SI-NEXT: v_cvt_f32_ubyte2_e32 v6, v8 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v5, v8 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v8 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: load_v8i8_to_v8f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[7:8], v[0:1] +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v7 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v7 @@ -1197,8 +1546,8 @@ ; VI-NEXT: v_cvt_f32_ubyte2_e32 v6, v8 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v5, v8 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v8 -; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: load_v8i8_to_v8f32: @@ -1221,6 +1570,32 @@ ; GFX10-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 ; GFX10-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX9-LABEL: load_v8i8_to_v8f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: global_load_dwordx2 v[7:8], v[0:1], off +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v3, v7 +; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v7 +; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v7 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v7 +; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v7, v8 +; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v6, v8 +; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v5, v8 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v4, v8 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v9, v[4:7], s[0:1] offset:16 +; GFX9-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] +; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <8 x i8>, <8 x i8> addrspace(1)* %in, i32 %tid %load = load <8 x i8>, <8 x i8> addrspace(1)* %gep, align 8 @@ -1232,38 +1607,39 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: i8_zext_inreg_i32_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 2, v0 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: i8_zext_inreg_i32_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: i8_zext_inreg_i32_to_f32: @@ -1279,6 +1655,25 @@ ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX9-LABEL: i8_zext_inreg_i32_to_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v0, 2, v0 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %load = load i32, i32 addrspace(1)* %gep, align 4 @@ -1292,36 +1687,37 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: i8_zext_inreg_hi1_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: i8_zext_inreg_hi1_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: i8_zext_inreg_hi1_to_f32: @@ -1336,6 +1732,24 @@ ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX9-LABEL: i8_zext_inreg_hi1_to_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %load = load i32, i32 addrspace(1)* %gep, align 4 @@ -1351,34 +1765,35 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: i8_zext_i32_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: i8_zext_i32_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: i8_zext_i32_to_f32: @@ -1392,6 +1807,23 @@ ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX9-LABEL: i8_zext_i32_to_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid %load = load i8, i8 addrspace(1)* %gep, align 1 @@ -1404,59 +1836,61 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { ; SI-LABEL: v4i8_zext_v4i32_to_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:1 -; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:2 -; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[4:7], 0 addr64 offset:3 +; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:2 +; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[4:7], 0 addr64 offset:1 +; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v2 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v5 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v0 -; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v4i8_zext_v4i32_to_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 2, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v0 -; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ubyte v4, v[4:5] -; VI-NEXT: flat_load_ubyte v5, v[6:7] -; VI-NEXT: flat_load_ubyte v6, v[2:3] -; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: flat_load_ubyte v2, v[2:3] +; VI-NEXT: flat_load_ubyte v3, v[4:5] +; VI-NEXT: flat_load_ubyte v4, v[0:1] +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_ubyte v1, v[0:1] +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v5 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v6 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: v4i8_zext_v4i32_to_v4f32: @@ -1481,6 +1915,33 @@ ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v5 ; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX9-LABEL: v4i8_zext_v4i32_to_v4f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: global_load_ubyte v2, v[0:1], off offset:3 +; GFX9-NEXT: global_load_ubyte v4, v[0:1], off offset:2 +; GFX9-NEXT: global_load_ubyte v5, v[0:1], off offset:1 +; GFX9-NEXT: global_load_ubyte v6, v[0:1], off +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, v2 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v6 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1 @@ -1493,36 +1954,37 @@ define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: extract_byte0_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: extract_byte0_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: extract_byte0_to_f32: @@ -1537,6 +1999,24 @@ ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX9-LABEL: extract_byte0_to_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %val = load i32, i32 addrspace(1)* %gep @@ -1549,36 +2029,37 @@ define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: extract_byte1_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: extract_byte1_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: extract_byte1_to_f32: @@ -1593,6 +2074,24 @@ ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX9-LABEL: extract_byte1_to_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %val = load i32, i32 addrspace(1)* %gep @@ -1606,36 +2105,37 @@ define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: extract_byte2_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: extract_byte2_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: extract_byte2_to_f32: @@ -1650,6 +2150,24 @@ ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX9-LABEL: extract_byte2_to_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %val = load i32, i32 addrspace(1)* %gep @@ -1663,36 +2181,37 @@ define amdgpu_kernel void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: extract_byte3_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: extract_byte3_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: extract_byte3_to_f32: @@ -1707,6 +2226,24 @@ ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX9-LABEL: extract_byte3_to_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %val = load i32, i32 addrspace(1)* %gep @@ -1720,42 +2257,42 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(i32 addrspace(1)* %in, float addrspace(1)* %out) { ; SI-LABEL: cvt_ubyte0_or_multiuse: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, 0x80000001, v0 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v0 ; SI-NEXT: v_add_f32_e32 v0, v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: cvt_ubyte0_or_multiuse: ; VI: ; %bb.0: ; %bb -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_e32 v0, 0x80000001, v0 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v0 ; VI-NEXT: v_add_f32_e32 v0, v0, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: cvt_ubyte0_or_multiuse: @@ -1771,6 +2308,25 @@ ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX10-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-NEXT: s_endpgm +; +; GFX9-LABEL: cvt_ubyte0_or_multiuse: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2c +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v0, 0x80000001, v0 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 +; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX9-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-NEXT: s_endpgm bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %lid @@ -1787,14 +2343,173 @@ %Vec = type { [4 x i8] } define amdgpu_kernel void @cvt_f32_ubyte0_vector() local_unnamed_addr { +; SI-LABEL: cvt_f32_ubyte0_vector: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[4:7], 0 addr64 offset:3 +; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:2 +; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[4:7], 0 addr64 offset:1 +; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v3 +; SI-NEXT: v_fma_f32 v1, v2, v1, 0.5 +; SI-NEXT: v_cvt_u32_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_byte v4, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_byte v5, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0 +; SI-NEXT: .LBB40_1: ; %for.body.i +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_branch .LBB40_1 +; +; VI-LABEL: cvt_f32_ubyte0_vector: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: v_mov_b32_e32 v2, -1 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_mov_b64 s[4:5], exec +; VI-NEXT: .LBB40_1: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_readfirstlane_b32 s8, v0 +; VI-NEXT: v_readfirstlane_b32 s9, v1 +; VI-NEXT: v_readfirstlane_b32 s10, v2 +; VI-NEXT: v_readfirstlane_b32 s11, v3 +; VI-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; VI-NEXT: v_cmp_eq_u64_e64 s[0:1], s[10:11], v[2:3] +; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; VI-NEXT: s_nop 0 +; VI-NEXT: buffer_load_ubyte v4, off, s[8:11], 0 +; VI-NEXT: s_xor_b64 exec, exec, s[0:1] +; VI-NEXT: s_cbranch_execnz .LBB40_1 +; VI-NEXT: ; %bb.2: +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_mov_b64 s[4:5], exec +; VI-NEXT: .LBB40_3: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_readfirstlane_b32 s8, v0 +; VI-NEXT: v_readfirstlane_b32 s9, v1 +; VI-NEXT: v_readfirstlane_b32 s10, v2 +; VI-NEXT: v_readfirstlane_b32 s11, v3 +; VI-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; VI-NEXT: v_cmp_eq_u64_e64 s[0:1], s[10:11], v[2:3] +; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; VI-NEXT: s_nop 0 +; VI-NEXT: buffer_load_ubyte v5, off, s[8:11], 0 offset:1 +; VI-NEXT: s_xor_b64 exec, exec, s[0:1] +; VI-NEXT: s_cbranch_execnz .LBB40_3 +; VI-NEXT: ; %bb.4: +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_mov_b64 s[4:5], exec +; VI-NEXT: .LBB40_5: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_readfirstlane_b32 s8, v0 +; VI-NEXT: v_readfirstlane_b32 s9, v1 +; VI-NEXT: v_readfirstlane_b32 s10, v2 +; VI-NEXT: v_readfirstlane_b32 s11, v3 +; VI-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; VI-NEXT: v_cmp_eq_u64_e64 s[0:1], s[10:11], v[2:3] +; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; VI-NEXT: s_nop 0 +; VI-NEXT: buffer_load_ubyte v6, off, s[8:11], 0 offset:2 +; VI-NEXT: s_xor_b64 exec, exec, s[0:1] +; VI-NEXT: s_cbranch_execnz .LBB40_5 +; VI-NEXT: ; %bb.6: +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_mov_b64 s[4:5], exec +; VI-NEXT: .LBB40_7: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_readfirstlane_b32 s8, v0 +; VI-NEXT: v_readfirstlane_b32 s9, v1 +; VI-NEXT: v_readfirstlane_b32 s10, v2 +; VI-NEXT: v_readfirstlane_b32 s11, v3 +; VI-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; VI-NEXT: v_cmp_eq_u64_e64 s[0:1], s[10:11], v[2:3] +; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; VI-NEXT: s_nop 0 +; VI-NEXT: buffer_load_ubyte v7, off, s[8:11], 0 offset:3 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; VI-NEXT: s_xor_b64 exec, exec, s[0:1] +; VI-NEXT: s_cbranch_execnz .LBB40_7 +; VI-NEXT: ; %bb.8: +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v7 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-NEXT: v_add_f32_e32 v0, 0.5, v0 +; VI-NEXT: v_cvt_i32_f32_e32 v0, v0 +; VI-NEXT: buffer_store_byte v6, off, s[0:3], 0 +; VI-NEXT: buffer_store_byte v5, off, s[0:3], 0 +; VI-NEXT: buffer_store_byte v4, off, s[0:3], 0 +; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; VI-NEXT: .LBB40_9: ; %for.body.i +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_branch .LBB40_9 +; +; GFX10-LABEL: cvt_f32_ubyte0_vector: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_clause 0x4 +; GFX10-NEXT: global_load_ubyte v2, v[0:1], off offset:3 +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: global_load_ubyte v4, v[0:1], off offset:2 +; GFX10-NEXT: global_load_ubyte v5, v[0:1], off offset:1 +; GFX10-NEXT: global_load_ubyte v6, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(4) +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v2 +; GFX10-NEXT: s_waitcnt vmcnt(3) +; GFX10-NEXT: v_fma_f32 v0, v3, v0, 0.5 +; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: global_store_byte v[0:1], v4, off +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: global_store_byte v[0:1], v5, off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_byte v[0:1], v6, off +; GFX10-NEXT: global_store_byte v[0:1], v0, off +; GFX10-NEXT: .LBB40_1: ; %for.body.i +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_branch .LBB40_1 +; ; GFX9-LABEL: cvt_f32_ubyte0_vector: -; GFX9: ; %bb.0: ; %entry -; GFX9: global_load_ubyte [[REG0:v[0-9]+]], v[0:1], off -; GFX9: global_load_ubyte [[REG1:v[0-9]+]], v[0:1], off offset:1 -; GFX9: global_load_ubyte [[REG2:v[0-9]+]], v[0:1], off offset:2 -; GFX9: global_load_ubyte [[REG3:v[0-9]+]], v[0:1], off offset:3 -; GFX9: v_cvt_f32_ubyte0_e32 [[CVT:v[0-9]+]], [[REG3]] -; GFX9: v_fma_f32 v0, v1, [[CVT]], 0.5 +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: .LBB40_1: ; %for.body.i +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_ubyte v2, v[0:1], off offset:3 +; GFX9-NEXT: global_load_dword v3, v[0:1], off +; GFX9-NEXT: global_load_ubyte v4, v[0:1], off offset:2 +; GFX9-NEXT: global_load_ubyte v5, v[0:1], off offset:1 +; GFX9-NEXT: global_load_ubyte v6, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v2 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_fma_f32 v0, v3, v0, 0.5 +; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: global_store_byte v[0:1], v4, off +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: global_store_byte v[0:1], v5, off +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: global_store_byte v[0:1], v6, off +; GFX9-NEXT: global_store_byte v[0:1], v0, off +; GFX9-NEXT: s_branch .LBB40_1 entry: br label %for.body.i diff --git a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll --- a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll @@ -448,8 +448,8 @@ define protected amdgpu_kernel void @buffer.atomic.swap(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) { ; CHECK-LABEL: buffer.atomic.swap: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 @@ -470,8 +470,8 @@ define protected amdgpu_kernel void @buffer.atomic.add(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) { ; CHECK-LABEL: buffer.atomic.add: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 @@ -492,8 +492,8 @@ define protected amdgpu_kernel void @buffer.atomic.sub(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) { ; CHECK-LABEL: buffer.atomic.sub: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 @@ -514,8 +514,8 @@ define protected amdgpu_kernel void @buffer.atomic.smin(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) { ; CHECK-LABEL: buffer.atomic.smin: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 @@ -536,8 +536,8 @@ define protected amdgpu_kernel void @buffer.atomic.smax(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) { ; CHECK-LABEL: buffer.atomic.smax: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 @@ -558,8 +558,8 @@ define protected amdgpu_kernel void @buffer.atomic.umin(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) { ; CHECK-LABEL: buffer.atomic.umin: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 @@ -580,8 +580,8 @@ define protected amdgpu_kernel void @buffer.atomic.umax(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) { ; CHECK-LABEL: buffer.atomic.umax: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 @@ -602,8 +602,8 @@ define protected amdgpu_kernel void @buffer.atomic.and(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) { ; CHECK-LABEL: buffer.atomic.and: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 @@ -624,8 +624,8 @@ define protected amdgpu_kernel void @buffer.atomic.or(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) { ; CHECK-LABEL: buffer.atomic.or: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 @@ -646,8 +646,8 @@ define protected amdgpu_kernel void @buffer.atomic.xor(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) { ; CHECK-LABEL: buffer.atomic.xor: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 @@ -668,8 +668,8 @@ define protected amdgpu_kernel void @buffer.atomic.inc(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) { ; CHECK-LABEL: buffer.atomic.inc: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 @@ -690,8 +690,8 @@ define protected amdgpu_kernel void @buffer.atomic.dec(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) { ; CHECK-LABEL: buffer.atomic.dec: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 @@ -712,8 +712,8 @@ define protected amdgpu_kernel void @buffer.atomic.cmpswap(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) { ; CHECK-LABEL: buffer.atomic.cmpswap: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v1, 2 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c @@ -735,16 +735,16 @@ define protected amdgpu_kernel void @buffer.atomic.fadd(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) { ; CHECK-LABEL: buffer.atomic.fadd: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 -; CHECK-NEXT: v_mov_b32_e32 v0, 1.0 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v1, 1.0 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s2 -; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], 0 offen glc +; CHECK-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-NEXT: buffer_atomic_add_f32 v1, v0, s[4:7], 0 offen glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 +; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v1 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -759,8 +759,8 @@ define protected amdgpu_kernel void @buffer.atomic.fmin(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) { ; CHECK-LABEL: buffer.atomic.fmin: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0x3ff00000 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c @@ -784,8 +784,8 @@ define protected amdgpu_kernel void @buffer.atomic.fmax(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) { ; CHECK-LABEL: buffer.atomic.fmax: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0x3ff00000 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c diff --git a/llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir b/llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir --- a/llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir +++ b/llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir @@ -43,48 +43,59 @@ body: | ; CHECK-LABEL: name: dbg_value_ends_sched_region ; CHECK: bb.0: - ; CHECK: successors: %bb.1(0x80000000) - ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - ; CHECK: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; CHECK: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; CHECK: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; CHECK: [[DEF1:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; CHECK: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK: [[DEF4:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; CHECK: [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; CHECK: [[DEF6:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; CHECK: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK: [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; CHECK: undef %11.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec - ; CHECK: bb.1: - ; CHECK: successors: %bb.2(0x80000000) - ; CHECK: dead %12:vreg_64 = COPY [[DEF]] - ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[DEF3]] - ; CHECK: dead %14:vgpr_32 = COPY [[DEF2]] - ; CHECK: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF5]].sub1 - ; CHECK: dead %16:vgpr_32 = COPY %11.sub0 - ; CHECK: undef %17.sub0:vreg_64, %18:sreg_64_xexec = V_ADD_CO_U32_e64 [[DEF4]].sub0, [[DEF6]].sub0, 0, implicit $exec - ; CHECK: dead undef %17.sub1:vreg_64, dead %19:sreg_64_xexec = V_ADDC_U32_e64 [[DEF4]].sub1, [[DEF6]].sub1, %18, 0, implicit $exec - ; CHECK: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[DEF1]], 0, 0, implicit $exec :: (load (s64), addrspace 1) - ; CHECK: dead %20:sreg_64 = V_CMP_GT_I32_e64 4, [[DEF7]], implicit $exec - ; CHECK: GLOBAL_STORE_DWORDX2 [[COPY]], [[DEF8]], 288, 0, implicit $exec :: (store (s64), addrspace 1) - ; CHECK: bb.2: - ; CHECK: successors: %bb.3(0x80000000) - ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] - ; CHECK: bb.3: - ; CHECK: successors: %bb.2(0x40000000), %bb.4(0x40000000) - ; CHECK: undef [[DEF5]].sub1:vreg_64 = COPY [[COPY3]] - ; CHECK: S_CBRANCH_EXECZ %bb.2, implicit $exec - ; CHECK: bb.4: - ; CHECK: successors: %bb.5(0x80000000) - ; CHECK: dead %21:sreg_64 = COPY $exec - ; CHECK: dead %22:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY1]], 0, 0, implicit $exec :: (load (s128), addrspace 1) - ; CHECK: DBG_VALUE %22, $noreg, <0x{{[0-9a-f]+}}>, !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef), debug-location !DILocation(line: 0, scope: <0x{{[0-9a-f]+}}>) - ; CHECK: bb.5: - ; CHECK: successors: %bb.3(0x40000000), %bb.1(0x40000000) - ; CHECK: S_CBRANCH_EXECZ %bb.3, implicit $exec - ; CHECK: S_BRANCH %bb.1 + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF4:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF6:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: undef %11.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: undef %17.sub0:vreg_64, %18:sreg_64_xexec = V_ADD_CO_U32_e64 [[DEF4]].sub0, [[DEF6]].sub0, 0, implicit $exec + ; CHECK-NEXT: dead undef %17.sub1:vreg_64, dead %19:sreg_64_xexec = V_ADDC_U32_e64 [[DEF4]].sub1, [[DEF6]].sub1, %18, 0, implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[DEF1]], 0, 0, implicit $exec :: (load (s64), addrspace 1) + ; CHECK-NEXT: dead %12:vreg_64 = COPY [[DEF]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[DEF3]] + ; CHECK-NEXT: dead %14:vgpr_32 = COPY [[DEF2]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF5]].sub1 + ; CHECK-NEXT: dead %16:vgpr_32 = COPY %11.sub0 + ; CHECK-NEXT: dead %20:sreg_64 = V_CMP_GT_I32_e64 4, [[DEF7]], implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[DEF8]], 288, 0, implicit $exec :: (store (s64), addrspace 1) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: undef [[DEF5]].sub1:vreg_64 = COPY [[COPY3]] + ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: dead %21:sreg_64 = COPY $exec + ; CHECK-NEXT: dead %22:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY1]], 0, 0, implicit $exec :: (load (s128), addrspace 1) + ; CHECK-NEXT: DBG_VALUE %22, $noreg + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.5: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.1 bb.0: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll --- a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll @@ -230,17 +230,19 @@ ; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 -; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0 offset:2 -; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:4 -; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:6 +; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:2 +; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0 +; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:6 +; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:4 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v4, s1 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(2) -; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v2 offset:2 -; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v1 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(2) -; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v0 offset:6 -; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v3 offset:4 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) +; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v1 offset:2 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) +; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v2 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) +; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v3 offset:6 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) +; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v0 offset:4 ; ALIGNED-SDAG-NEXT: s_endpgm ; ; ALIGNED-GISEL-LABEL: ds8align2: @@ -424,21 +426,24 @@ ; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:8 ; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0 ; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:2 ; ALIGNED-SDAG-NEXT: ds_read_u16 v4, v0 offset:4 ; ALIGNED-SDAG-NEXT: ds_read_u16 v5, v0 offset:6 -; ALIGNED-SDAG-NEXT: ds_read_u16 v6, v0 offset:8 +; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v6, s1 ; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:10 -; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v1, s1 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) -; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v6 offset:8 -; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v3 offset:2 -; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v2 -; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v4 offset:4 -; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v5 offset:6 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) -; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v0 offset:10 +; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v1 offset:8 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(4) +; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v3 offset:2 +; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v2 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) +; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v4 offset:4 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) +; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v5 offset:6 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) +; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v0 offset:10 ; ALIGNED-SDAG-NEXT: s_endpgm ; ; ALIGNED-GISEL-LABEL: ds12align2: @@ -447,26 +452,26 @@ ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0 -; ALIGNED-GISEL-NEXT: ds_read_u16 v3, v0 offset:2 -; ALIGNED-GISEL-NEXT: ds_read_u16 v4, v0 offset:4 -; ALIGNED-GISEL-NEXT: ds_read_u16 v5, v0 offset:6 +; ALIGNED-GISEL-NEXT: ds_read_u16 v2, v0 offset:2 +; ALIGNED-GISEL-NEXT: ds_read_u16 v3, v0 offset:4 +; ALIGNED-GISEL-NEXT: ds_read_u16 v4, v0 offset:6 ; ALIGNED-GISEL-NEXT: ds_read_u16 v6, v0 offset:8 ; ALIGNED-GISEL-NEXT: ds_read_u16 v7, v0 offset:10 -; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s1 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v5, s1 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4) -; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v3, 16, v1 +; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v2, 16, v1 +; ALIGNED-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) -; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v5, 16, v4 -; ALIGNED-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; ALIGNED-GISEL-NEXT: ds_write_b16 v2, v0 -; ALIGNED-GISEL-NEXT: ds_write_b16 v2, v3 offset:2 +; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v4, 16, v3 +; ALIGNED-GISEL-NEXT: ds_write_b16 v5, v0 +; ALIGNED-GISEL-NEXT: ds_write_b16 v5, v2 offset:2 ; ALIGNED-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; ALIGNED-GISEL-NEXT: ds_write_b16 v2, v1 offset:4 -; ALIGNED-GISEL-NEXT: ds_write_b16 v2, v0 offset:6 +; ALIGNED-GISEL-NEXT: ds_write_b16 v5, v1 offset:4 +; ALIGNED-GISEL-NEXT: ds_write_b16 v5, v0 offset:6 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(5) -; ALIGNED-GISEL-NEXT: ds_write_b16 v2, v6 offset:8 +; ALIGNED-GISEL-NEXT: ds_write_b16 v5, v6 offset:8 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(5) -; ALIGNED-GISEL-NEXT: ds_write_b16 v2, v7 offset:10 +; ALIGNED-GISEL-NEXT: ds_write_b16 v5, v7 offset:10 ; ALIGNED-GISEL-NEXT: s_endpgm ; ; UNALIGNED-LABEL: ds12align2: @@ -520,12 +525,13 @@ ; ALIGNED-SDAG: ; %bb.0: ; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s0 -; ALIGNED-SDAG-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 -; ALIGNED-SDAG-NEXT: ds_read_b32 v2, v2 offset:8 +; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; ALIGNED-SDAG-NEXT: ds_read_b32 v2, v0 offset:8 +; ALIGNED-SDAG-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) ; ALIGNED-SDAG-NEXT: ds_write_b32 v3, v2 offset:8 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) ; ALIGNED-SDAG-NEXT: ds_write_b64 v3, v[0:1] ; ALIGNED-SDAG-NEXT: s_endpgm ; @@ -687,25 +693,29 @@ ; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:12 ; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0 ; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:2 ; ALIGNED-SDAG-NEXT: ds_read_u16 v4, v0 offset:4 ; ALIGNED-SDAG-NEXT: ds_read_u16 v5, v0 offset:6 ; ALIGNED-SDAG-NEXT: ds_read_u16 v6, v0 offset:8 ; ALIGNED-SDAG-NEXT: ds_read_u16 v7, v0 offset:10 -; ALIGNED-SDAG-NEXT: ds_read_u16 v8, v0 offset:12 +; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v8, s1 ; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:14 -; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v1, s1 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) -; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v8 offset:12 -; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v3 offset:2 -; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v2 -; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v4 offset:4 -; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v6 offset:8 -; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v5 offset:6 -; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v7 offset:10 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) -; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v0 offset:14 +; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v1 offset:12 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(6) +; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v3 offset:2 +; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v2 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) +; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v4 offset:4 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(6) +; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v6 offset:8 +; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v5 offset:6 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) +; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v7 offset:10 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) +; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v0 offset:14 ; ALIGNED-SDAG-NEXT: s_endpgm ; ; ALIGNED-GISEL-LABEL: ds16align2: @@ -761,12 +771,13 @@ ; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s0 -; ALIGNED-SDAG-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 -; ALIGNED-SDAG-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3 +; ALIGNED-SDAG-NEXT: ds_read2_b32 v[0:1], v2 offset0:2 offset1:3 +; ALIGNED-SDAG-NEXT: ds_read2_b32 v[2:3], v2 offset1:1 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v4, s1 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-SDAG-NEXT: ds_write2_b32 v4, v2, v3 offset0:2 offset1:3 -; ALIGNED-SDAG-NEXT: ds_write2_b32 v4, v0, v1 offset1:1 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) +; ALIGNED-SDAG-NEXT: ds_write2_b32 v4, v0, v1 offset0:2 offset1:3 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) +; ALIGNED-SDAG-NEXT: ds_write2_b32 v4, v2, v3 offset1:1 ; ALIGNED-SDAG-NEXT: s_endpgm ; ; ALIGNED-GISEL-LABEL: ds16align4: diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll --- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll @@ -183,14 +183,14 @@ ; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_barrier +; CI-NEXT: ds_read2_b32 v[3:4], v0 offset0:11 offset1:27 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: v_add_f32_e32 v1, v1, v2 ; CI-NEXT: s_mov_b32 s2, 0 -; CI-NEXT: v_add_f32_e32 v3, v1, v2 -; CI-NEXT: ds_read2_b32 v[1:2], v0 offset0:11 offset1:27 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_add_f32_e32 v1, v1, v2 -; CI-NEXT: v_add_f32_e32 v2, v3, v1 +; CI-NEXT: v_add_f32_e32 v2, v3, v4 +; CI-NEXT: v_add_f32_e32 v2, v1, v2 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm @@ -301,36 +301,36 @@ define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 { ; CI-LABEL: read2_ptr_is_subreg_arg_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; CI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v1, s0 -; CI-NEXT: v_mov_b32_e32 v2, s1 +; CI-NEXT: v_mov_b32_e32 v1, s2 +; CI-NEXT: v_mov_b32_e32 v2, s3 ; CI-NEXT: ds_read_b32 v1, v1 offset:32 ; CI-NEXT: ds_read_b32 v2, v2 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_add_f32_e32 v2, v1, v2 ; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm ; ; GFX9-LABEL: read2_ptr_is_subreg_arg_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-NEXT: ds_read_b32 v1, v1 offset:32 ; GFX9-NEXT: ds_read_b32 v2, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0 @@ -354,36 +354,36 @@ define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 { ; CI-LABEL: read2_ptr_is_subreg_arg_offset_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; CI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v1, s0 -; CI-NEXT: v_mov_b32_e32 v2, s1 +; CI-NEXT: v_mov_b32_e32 v1, s2 +; CI-NEXT: v_mov_b32_e32 v2, s3 ; CI-NEXT: ds_read_b32 v1, v1 offset:32 ; CI-NEXT: ds_read_b32 v2, v2 offset:32 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_add_f32_e32 v2, v1, v2 ; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm ; ; GFX9-LABEL: read2_ptr_is_subreg_arg_offset_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-NEXT: ds_read_b32 v1, v1 offset:32 ; GFX9-NEXT: ds_read_b32 v2, v2 offset:32 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0 @@ -526,48 +526,47 @@ define amdgpu_kernel void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { ; CI-LABEL: unaligned_read2_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; CI-NEXT: s_load_dword s0, s[0:1], 0xb +; CI-NEXT: s_load_dword s2, s[0:1], 0xb ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_add_i32_e32 v1, vcc, s0, v0 -; CI-NEXT: ds_read_u8 v2, v1 offset:35 -; CI-NEXT: ds_read_u8 v3, v1 offset:34 -; CI-NEXT: ds_read_u8 v4, v1 offset:33 -; CI-NEXT: ds_read_u8 v5, v1 offset:32 -; CI-NEXT: ds_read_u8 v6, v1 offset:3 -; CI-NEXT: ds_read_u8 v7, v1 offset:2 -; CI-NEXT: ds_read_u8 v8, v1 offset:1 -; CI-NEXT: ds_read_u8 v1, v1 -; CI-NEXT: s_waitcnt lgkmcnt(7) -; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0 +; CI-NEXT: ds_read_u8 v2, v1 offset:34 +; CI-NEXT: ds_read_u8 v3, v1 offset:32 +; CI-NEXT: ds_read_u8 v4, v1 offset:3 +; CI-NEXT: ds_read_u8 v5, v1 offset:2 +; CI-NEXT: ds_read_u8 v6, v1 offset:1 +; CI-NEXT: ds_read_u8 v7, v1 +; CI-NEXT: ds_read_u8 v8, v1 offset:33 +; CI-NEXT: ds_read_u8 v1, v1 offset:35 +; CI-NEXT: s_waitcnt lgkmcnt(5) +; CI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; CI-NEXT: s_waitcnt lgkmcnt(3) ; CI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; CI-NEXT: s_waitcnt lgkmcnt(2) -; CI-NEXT: v_or_b32_e32 v6, v6, v7 +; CI-NEXT: v_or_b32_e32 v4, v4, v5 ; CI-NEXT: s_waitcnt lgkmcnt(1) -; CI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; CI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; CI-NEXT: v_or_b32_e32 v2, v2, v3 +; CI-NEXT: v_lshlrev_b32_e32 v5, 8, v8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_or_b32_e32 v1, v8, v1 -; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; CI-NEXT: v_or_b32_e32 v4, v4, v5 -; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_or_b32_e32 v1, v6, v1 -; CI-NEXT: v_or_b32_e32 v2, v2, v4 -; CI-NEXT: v_add_f32_e32 v2, v1, v2 +; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; CI-NEXT: v_or_b32_e32 v1, v1, v2 +; CI-NEXT: v_or_b32_e32 v6, v6, v7 +; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; CI-NEXT: v_or_b32_e32 v3, v5, v3 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v4, v4, v6 +; CI-NEXT: v_or_b32_e32 v1, v1, v3 +; CI-NEXT: v_add_f32_e32 v2, v4, v1 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm ; ; GFX9-ALIGNED-LABEL: unaligned_read2_f32: ; GFX9-ALIGNED: ; %bb.0: -; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s4, v0 @@ -595,15 +594,15 @@ ; ; GFX9-UNALIGNED-LABEL: unaligned_read2_f32: ; GFX9-UNALIGNED: ; %bb.0: -; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-UNALIGNED-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s4, v2 +; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s2, v2 ; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:8 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX9-UNALIGNED-NEXT: global_store_dword v2, v0, s[2:3] +; GFX9-UNALIGNED-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-UNALIGNED-NEXT: s_endpgm %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i @@ -620,48 +619,47 @@ define amdgpu_kernel void @unaligned_offset_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { ; CI-LABEL: unaligned_offset_read2_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; CI-NEXT: s_load_dword s0, s[0:1], 0xb +; CI-NEXT: s_load_dword s2, s[0:1], 0xb ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_add_i32_e32 v1, vcc, s0, v0 -; CI-NEXT: ds_read_u8 v2, v1 offset:12 -; CI-NEXT: ds_read_u8 v3, v1 offset:11 -; CI-NEXT: ds_read_u8 v4, v1 offset:10 -; CI-NEXT: ds_read_u8 v5, v1 offset:9 -; CI-NEXT: ds_read_u8 v6, v1 offset:8 -; CI-NEXT: ds_read_u8 v7, v1 offset:7 -; CI-NEXT: ds_read_u8 v8, v1 offset:6 -; CI-NEXT: ds_read_u8 v1, v1 offset:5 -; CI-NEXT: s_waitcnt lgkmcnt(7) -; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0 +; CI-NEXT: ds_read_u8 v2, v1 offset:11 +; CI-NEXT: ds_read_u8 v3, v1 offset:9 +; CI-NEXT: ds_read_u8 v4, v1 offset:8 +; CI-NEXT: ds_read_u8 v5, v1 offset:7 +; CI-NEXT: ds_read_u8 v6, v1 offset:6 +; CI-NEXT: ds_read_u8 v7, v1 offset:5 +; CI-NEXT: ds_read_u8 v8, v1 offset:10 +; CI-NEXT: ds_read_u8 v1, v1 offset:12 +; CI-NEXT: s_waitcnt lgkmcnt(5) +; CI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; CI-NEXT: s_waitcnt lgkmcnt(3) ; CI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; CI-NEXT: s_waitcnt lgkmcnt(2) -; CI-NEXT: v_or_b32_e32 v6, v6, v7 +; CI-NEXT: v_or_b32_e32 v4, v4, v5 ; CI-NEXT: s_waitcnt lgkmcnt(1) -; CI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; CI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; CI-NEXT: v_or_b32_e32 v2, v2, v3 +; CI-NEXT: v_lshlrev_b32_e32 v5, 8, v8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_or_b32_e32 v1, v8, v1 -; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; CI-NEXT: v_or_b32_e32 v4, v4, v5 -; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_or_b32_e32 v1, v6, v1 -; CI-NEXT: v_or_b32_e32 v2, v2, v4 -; CI-NEXT: v_add_f32_e32 v2, v1, v2 +; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; CI-NEXT: v_or_b32_e32 v1, v1, v2 +; CI-NEXT: v_or_b32_e32 v6, v6, v7 +; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; CI-NEXT: v_or_b32_e32 v3, v5, v3 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v4, v4, v6 +; CI-NEXT: v_or_b32_e32 v1, v1, v3 +; CI-NEXT: v_add_f32_e32 v2, v4, v1 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm ; ; GFX9-ALIGNED-LABEL: unaligned_offset_read2_f32: ; GFX9-ALIGNED: ; %bb.0: -; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s4, v0 @@ -689,15 +687,15 @@ ; ; GFX9-UNALIGNED-LABEL: unaligned_offset_read2_f32: ; GFX9-UNALIGNED: ; %bb.0: -; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-UNALIGNED-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-UNALIGNED-NEXT: v_add3_u32 v0, s4, v2, 5 +; GFX9-UNALIGNED-NEXT: v_add3_u32 v0, s2, v2, 5 ; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX9-UNALIGNED-NEXT: global_store_dword v2, v0, s[2:3] +; GFX9-UNALIGNED-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-UNALIGNED-NEXT: s_endpgm %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %base = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i @@ -717,38 +715,37 @@ define amdgpu_kernel void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { ; CI-LABEL: misaligned_2_simple_read2_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; CI-NEXT: s_load_dword s0, s[0:1], 0xb +; CI-NEXT: s_load_dword s2, s[0:1], 0xb ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_add_i32_e32 v1, vcc, s0, v0 -; CI-NEXT: ds_read_u16 v2, v1 offset:34 -; CI-NEXT: ds_read_u16 v3, v1 offset:32 -; CI-NEXT: ds_read_u16 v4, v1 offset:2 -; CI-NEXT: ds_read_u16 v1, v1 -; CI-NEXT: s_waitcnt lgkmcnt(3) -; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0 +; CI-NEXT: ds_read_u16 v2, v1 offset:32 +; CI-NEXT: ds_read_u16 v3, v1 offset:2 +; CI-NEXT: ds_read_u16 v4, v1 +; CI-NEXT: ds_read_u16 v1, v1 offset:34 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(2) -; CI-NEXT: v_or_b32_e32 v2, v2, v3 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: s_waitcnt lgkmcnt(1) -; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; CI-NEXT: v_or_b32_e32 v3, v3, v4 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_or_b32_e32 v1, v4, v1 -; CI-NEXT: v_add_f32_e32 v2, v1, v2 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v1, v1, v2 +; CI-NEXT: v_add_f32_e32 v2, v3, v1 ; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm ; ; GFX9-ALIGNED-LABEL: misaligned_2_simple_read2_f32: ; GFX9-ALIGNED: ; %bb.0: -; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-ALIGNED-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s4, v0 +; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s2, v0 ; GFX9-ALIGNED-NEXT: ds_read_u16 v2, v1 ; GFX9-ALIGNED-NEXT: ds_read_u16 v3, v1 offset:2 ; GFX9-ALIGNED-NEXT: ds_read_u16 v4, v1 offset:32 @@ -758,20 +755,20 @@ ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 16, v4 ; GFX9-ALIGNED-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-ALIGNED-NEXT: s_endpgm ; ; GFX9-UNALIGNED-LABEL: misaligned_2_simple_read2_f32: ; GFX9-UNALIGNED: ; %bb.0: -; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-UNALIGNED-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s4, v2 +; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s2, v2 ; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:8 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX9-UNALIGNED-NEXT: global_store_dword v2, v0, s[2:3] +; GFX9-UNALIGNED-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-UNALIGNED-NEXT: s_endpgm %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i @@ -1182,22 +1179,22 @@ define amdgpu_kernel void @misaligned_read2_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(3)* %in) #0 { ; CI-LABEL: misaligned_read2_v2i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; CI-NEXT: s_load_dword s0, s[0:1], 0xb +; CI-NEXT: s_load_dword s2, s[0:1], 0xb +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; GFX9-LABEL: misaligned_read2_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -1213,22 +1210,22 @@ define amdgpu_kernel void @misaligned_read2_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %in) #0 { ; CI-LABEL: misaligned_read2_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; CI-NEXT: s_load_dword s0, s[0:1], 0xb +; CI-NEXT: s_load_dword s2, s[0:1], 0xb +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; GFX9-LABEL: misaligned_read2_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -1244,41 +1241,41 @@ define amdgpu_kernel void @ds_read_diff_base_interleaving( ; CI-LABEL: ds_read_diff_base_interleaving: ; CI: ; %bb.0: ; %bb -; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v1 -; CI-NEXT: v_add_i32_e32 v4, vcc, s1, v0 -; CI-NEXT: v_add_i32_e32 v3, vcc, s2, v1 -; CI-NEXT: v_add_i32_e32 v6, vcc, s3, v0 +; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v1 +; CI-NEXT: v_add_i32_e32 v3, vcc, s5, v4 +; CI-NEXT: v_add_i32_e32 v5, vcc, s6, v1 ; CI-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 -; CI-NEXT: ds_read2_b32 v[2:3], v3 offset1:1 -; CI-NEXT: ds_read2_b32 v[4:5], v4 offset1:4 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mul_f32_e32 v0, v0, v4 -; CI-NEXT: v_add_f32_e32 v4, 2.0, v0 -; CI-NEXT: v_mul_f32_e32 v5, v1, v5 -; CI-NEXT: ds_read2_b32 v[0:1], v6 offset1:4 +; CI-NEXT: ds_read2_b32 v[2:3], v3 offset1:4 +; CI-NEXT: v_add_i32_e32 v6, vcc, s7, v4 +; CI-NEXT: ds_read2_b32 v[4:5], v5 offset1:1 +; CI-NEXT: ds_read2_b32 v[6:7], v6 offset1:4 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_waitcnt lgkmcnt(2) +; CI-NEXT: v_mul_f32_e32 v0, v0, v2 +; CI-NEXT: v_add_f32_e32 v0, 2.0, v0 +; CI-NEXT: v_mul_f32_e32 v1, v1, v3 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mul_f32_e32 v0, v2, v0 -; CI-NEXT: v_sub_f32_e32 v0, v4, v0 -; CI-NEXT: v_sub_f32_e32 v0, v0, v5 -; CI-NEXT: v_mul_f32_e32 v1, v3, v1 +; CI-NEXT: v_mul_f32_e32 v2, v4, v6 +; CI-NEXT: v_sub_f32_e32 v0, v0, v2 +; CI-NEXT: v_sub_f32_e32 v0, v0, v1 +; CI-NEXT: v_mul_f32_e32 v1, v5, v7 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_sub_f32_e32 v0, v0, v1 -; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:40 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:40 ; CI-NEXT: s_endpgm ; ; GFX9-LABEL: ds_read_diff_base_interleaving: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v2, s4, v1 @@ -1299,7 +1296,7 @@ ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, v5, v7 ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX9-NEXT: global_store_dword v8, v0, s[2:3] offset:40 +; GFX9-NEXT: global_store_dword v8, v0, s[0:1] offset:40 ; GFX9-NEXT: s_endpgm float addrspace(1)* nocapture %arg, [4 x [4 x float]] addrspace(3)* %arg1, @@ -1344,8 +1341,8 @@ ; CI-NEXT: s_getpc_b64 s[40:41] ; CI-NEXT: s_mov_b32 s40, s0 ; CI-NEXT: s_load_dwordx4 s[40:43], s[40:41], 0x0 -; CI-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; CI-NEXT: s_load_dword s0, s[4:5], 0xb +; CI-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_mov_b32 s32, 0 @@ -1376,27 +1373,27 @@ ; GFX9-NEXT: s_mov_b32 s36, s0 ; GFX9-NEXT: s_load_dwordx4 s[36:39], s[36:37], 0x0 ; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x2c ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-NEXT: v_mov_b32_e32 v42, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_lshl_add_u32 v40, v0, 2, s2 ; GFX9-NEXT: s_add_u32 s36, s36, s11 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: v_lshl_add_u32 v41, v0, 2, s0 ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, void_func_void@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, void_func_void@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-NEXT: ds_read_b32 v42, v41 +; GFX9-NEXT: ds_read_b32 v41, v40 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: ds_read_b32 v0, v41 offset:4 +; GFX9-NEXT: ds_read_b32 v0, v40 offset:4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v0, v42, v0 -; GFX9-NEXT: global_store_dword v40, v0, s[34:35] +; GFX9-NEXT: v_add_u32_e32 v0, v41, v0 +; GFX9-NEXT: global_store_dword v42, v0, s[34:35] ; GFX9-NEXT: s_endpgm %x = call i32 @llvm.amdgcn.workitem.id.x() %arrayidx0 = getelementptr i32, i32 addrspace(3)* %arg, i32 %x @@ -1449,33 +1446,33 @@ ; CI: ; %bb.0: ; %entry ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: ds_read_u8 v1, v0 offset:72 -; CI-NEXT: ds_read_u8 v2, v0 offset:71 -; CI-NEXT: ds_read_u8 v3, v0 offset:70 +; CI-NEXT: ds_read_u8 v1, v0 offset:70 +; CI-NEXT: ds_read_u8 v2, v0 offset:72 +; CI-NEXT: ds_read_u8 v3, v0 offset:71 ; CI-NEXT: ds_read_u8 v4, v0 offset:69 ; CI-NEXT: ds_read_u8 v5, v0 offset:68 ; CI-NEXT: s_waitcnt lgkmcnt(4) ; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; CI-NEXT: s_waitcnt lgkmcnt(3) -; CI-NEXT: v_or_b32_e32 v1, v1, v2 +; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; CI-NEXT: s_waitcnt lgkmcnt(2) -; CI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: s_waitcnt lgkmcnt(1) -; CI-NEXT: v_or_b32_e32 v3, v3, v4 -; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_or_b32_e32 v1, v1, v3 -; CI-NEXT: ds_read_u8 v2, v0 offset:67 -; CI-NEXT: ds_read_u8 v3, v0 offset:66 +; CI-NEXT: v_or_b32_e32 v1, v1, v4 +; CI-NEXT: ds_read_u8 v4, v0 offset:67 +; CI-NEXT: ds_read_u8 v6, v0 offset:66 ; CI-NEXT: ds_read_u8 v0, v0 offset:65 +; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: v_or_b32_e32 v1, v2, v1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; CI-NEXT: v_or_b32_e32 v0, v3, v0 -; CI-NEXT: v_lshlrev_b32_e32 v3, 8, v5 -; CI-NEXT: v_or_b32_e32 v2, v3, v2 +; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 +; CI-NEXT: v_or_b32_e32 v0, v2, v0 +; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v5 +; CI-NEXT: v_or_b32_e32 v2, v2, v4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_or_b32_e32 v0, v2, v0 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm @@ -1483,25 +1480,26 @@ ; GFX9-ALIGNED-LABEL: read2_v2i32_align1_odd_offset: ; GFX9-ALIGNED: ; %bb.0: ; %entry ; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-ALIGNED-NEXT: ds_read_u8 v0, v2 offset:65 -; GFX9-ALIGNED-NEXT: ds_read_u8 v3, v2 offset:66 -; GFX9-ALIGNED-NEXT: ds_read_u8 v4, v2 offset:67 -; GFX9-ALIGNED-NEXT: ds_read_u8 v5, v2 offset:68 +; GFX9-ALIGNED-NEXT: ds_read_u8 v0, v2 offset:70 +; GFX9-ALIGNED-NEXT: ds_read_u8 v3, v2 offset:65 +; GFX9-ALIGNED-NEXT: ds_read_u8 v4, v2 offset:66 +; GFX9-ALIGNED-NEXT: ds_read_u8 v5, v2 offset:67 +; GFX9-ALIGNED-NEXT: ds_read_u8 v6, v2 offset:68 ; GFX9-ALIGNED-NEXT: ds_read_u8 v1, v2 offset:69 -; GFX9-ALIGNED-NEXT: ds_read_u8 v6, v2 offset:70 -; GFX9-ALIGNED-NEXT: ds_read_u8 v7, v2 offset:71 -; GFX9-ALIGNED-NEXT: ds_read_u8 v8, v2 offset:72 +; GFX9-ALIGNED-NEXT: ds_read_u8 v7, v2 offset:72 +; GFX9-ALIGNED-NEXT: ds_read_u8 v8, v2 offset:71 +; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(7) +; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; GFX9-ALIGNED-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v6, 8, v8 -; GFX9-ALIGNED-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 8, v5 -; GFX9-ALIGNED-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-ALIGNED-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-ALIGNED-NEXT: v_or_b32_e32 v1, v6, v1 +; GFX9-ALIGNED-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; GFX9-ALIGNED-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-ALIGNED-NEXT: v_or_b32_e32 v1, v1, v0 +; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 8, v4 +; GFX9-ALIGNED-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 8, v6 +; GFX9-ALIGNED-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-ALIGNED-NEXT: v_or_b32_e32 v0, v3, v0 ; GFX9-ALIGNED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-ALIGNED-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll --- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll @@ -85,18 +85,18 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { ; CI-LABEL: simple_write2_two_val_f32_volatile_0: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[0:1], s[4:5] -; CI-NEXT: s_mov_b64 s[4:5], s[6:7] -; CI-NEXT: s_mov_b64 s[6:7], s[2:3] -; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; CI-NEXT: s_mov_b64 s[4:5], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[2:3] +; CI-NEXT: s_mov_b64 s[2:3], s[6:7] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 glc +; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_write_b32 v0, v2 @@ -131,18 +131,18 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { ; CI-LABEL: simple_write2_two_val_f32_volatile_1: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[0:1], s[4:5] -; CI-NEXT: s_mov_b64 s[4:5], s[6:7] -; CI-NEXT: s_mov_b64 s[6:7], s[2:3] -; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; CI-NEXT: s_mov_b64 s[4:5], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[2:3] +; CI-NEXT: s_mov_b64 s[2:3], s[6:7] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 glc +; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_write_b32 v0, v2 @@ -348,17 +348,17 @@ define amdgpu_kernel void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { ; CI-LABEL: simple_write2_two_val_too_far_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[0:1], s[4:5] -; CI-NEXT: s_mov_b64 s[4:5], s[6:7] -; CI-NEXT: s_mov_b64 s[6:7], s[2:3] -; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; CI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 +; CI-NEXT: s_mov_b64 s[4:5], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[2:3] +; CI-NEXT: s_mov_b64 s[2:3], s[6:7] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: ds_write_b32 v0, v2 @@ -394,17 +394,17 @@ define amdgpu_kernel void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { ; CI-LABEL: simple_write2_two_val_f32_x2: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[0:1], s[4:5] -; CI-NEXT: s_mov_b64 s[4:5], s[6:7] -; CI-NEXT: s_mov_b64 s[6:7], s[2:3] -; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; CI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 +; CI-NEXT: s_mov_b64 s[4:5], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[2:3] +; CI-NEXT: s_mov_b64 s[2:3], s[6:7] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: ds_write2_b32 v0, v2, v1 offset1:8 @@ -450,17 +450,17 @@ define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { ; CI-LABEL: simple_write2_two_val_f32_x2_nonzero_base: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[0:1], s[4:5] -; CI-NEXT: s_mov_b64 s[4:5], s[6:7] -; CI-NEXT: s_mov_b64 s[6:7], s[2:3] -; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; CI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 +; CI-NEXT: s_mov_b64 s[4:5], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[2:3] +; CI-NEXT: s_mov_b64 s[2:3], s[6:7] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: ds_write2_b32 v0, v2, v1 offset0:3 offset1:8 @@ -653,8 +653,8 @@ ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: ds_write_b8 v0, v1 offset:5 ; CI-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; CI-NEXT: ds_write_b8 v0, v1 offset:5 ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; CI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; CI-NEXT: ds_write_b8 v0, v2 offset:13 @@ -874,18 +874,18 @@ ; CI-LABEL: write2_sgemm_sequence: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; CI-NEXT: s_lshl_b32 s2, s2, 2 -; CI-NEXT: s_add_i32 s3, s2, 0xc20 -; CI-NEXT: v_mov_b32_e32 v0, s3 -; CI-NEXT: s_addk_i32 s2, 0xc60 +; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s0, s[0:1], 0x0 -; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_lshl_b32 s1, s2, 2 +; CI-NEXT: s_add_i32 s2, s1, 0xc20 +; CI-NEXT: s_addk_i32 s1, 0xc60 +; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: v_mov_b32_e32 v3, s0 ; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1 -; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v0, s1 ; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v1 ; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1 @@ -897,11 +897,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_lshl_b32 s2, s2, 2 -; GFX9-NEXT: s_add_i32 s3, s2, 0xc20 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: s_addk_i32 s2, 0xc60 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-NEXT: s_add_i32 s1, s2, 0xc20 +; GFX9-NEXT: s_addk_i32 s2, 0xc60 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, s0 @@ -950,12 +950,12 @@ define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(<4 x float> addrspace(3)* %out, <4 x float> addrspace(1)* %in) #0 { ; CI-LABEL: simple_write2_v4f32_superreg_align4: ; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; CI-NEXT: s_load_dword s4, s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; CI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s0 @@ -968,8 +968,8 @@ ; ; GFX9-ALIGNED-LABEL: simple_write2_v4f32_superreg_align4: ; GFX9-ALIGNED: ; %bb.0: -; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x24 ; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x24 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s4 ; GFX9-ALIGNED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 @@ -984,8 +984,8 @@ ; ; GFX9-UNALIGNED-LABEL: simple_write2_v4f32_superreg_align4: ; GFX9-UNALIGNED: ; %bb.0: -; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x24 ; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x24 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: v_lshl_add_u32 v4, v0, 4, s4 ; GFX9-UNALIGNED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll @@ -38,9 +38,9 @@ } ; GCN-LABEL: {{^}}extract_vector_elt_v2i16_dynamic_vgpr: -; GCN-DAG: s_load_dword [[VEC:s[0-9]+]] ; GCN-DAG: {{flat|buffer|global}}_load_dword [[IDX:v[0-9]+]] -; GCN: v_lshlrev_b32_e32 [[IDX_SCALED:v[0-9]+]], 4, [[IDX]] +; GCN-DAG: v_lshlrev_b32_e32 [[IDX_SCALED:v[0-9]+]], 4, [[IDX]] +; GCN-DAG: s_load_dword [[VEC:s[0-9]+]] ; SI: v_lshr_b32_e32 [[ELT:v[0-9]+]], [[VEC]], [[IDX_SCALED]] ; VI: v_lshrrev_b32_e64 [[ELT:v[0-9]+]], [[IDX_SCALED]], [[VEC]] diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll @@ -133,8 +133,8 @@ ; isTypeDesirableForOp in SimplifyDemandedBits ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v2i8: -; VI: s_load_dword [[LOAD:s[0-9]+]], s[4:5], 0x28 -; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x4c +; VI: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x4c +; VI-NEXT: s_load_dword [[LOAD:s[0-9]+]], s[4:5], 0x28 ; VI-NOT: {{flat|buffer|global}} ; VI-DAG: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]] ; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 @@ -147,8 +147,8 @@ } ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i8: -; VI: s_load_dword [[LOAD:s[0-9]+]], s[4:5], 0x28 -; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x4c +; VI: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x4c +; VI-NEXT: s_load_dword [[LOAD:s[0-9]+]], s[4:5], 0x28 ; VI-NOT: {{flat|buffer|global}} ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 ; VI: s_lshr_b32 [[ELT:s[0-9]+]], [[LOAD]], [[SCALED_IDX]] diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll --- a/llvm/test/CodeGen/AMDGPU/fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global,-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs -enable-misched=0 < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global,-xnack -enable-misched=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s @@ -11,8 +11,7 @@ ; R600-NOT: AND ; R600: |PV.{{[XYZW]}}| -; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff -; VI: s_bitset0_b32 s{{[0-9]+}}, 31 +; GCN: s_bitset0_b32 s{{[0-9]+}}, 31 define amdgpu_kernel void @s_fabs_fn_free(float addrspace(1)* %out, i32 %in) { %bc= bitcast i32 %in to float %fabs = call float @fabs(float %bc) @@ -24,8 +23,7 @@ ; R600-NOT: AND ; R600: |PV.{{[XYZW]}}| -; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff -; VI: s_bitset0_b32 s{{[0-9]+}}, 31 +; GCN: s_bitset0_b32 s{{[0-9]+}}, 31 define amdgpu_kernel void @s_fabs_free(float addrspace(1)* %out, i32 %in) { %bc= bitcast i32 %in to float %fabs = call float @llvm.fabs.f32(float %bc) @@ -36,8 +34,7 @@ ; FUNC-LABEL: {{^}}s_fabs_f32: ; R600: |{{(PV|T[0-9])\.[XYZW]}}| -; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff -; VI: s_bitset0_b32 s{{[0-9]+}}, 31 +; GCN: s_bitset0_b32 s{{[0-9]+}}, 31 define amdgpu_kernel void @s_fabs_f32(float addrspace(1)* %out, float %in) { %fabs = call float @llvm.fabs.f32(float %in) store float %fabs, float addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll @@ -11,10 +11,11 @@ ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-ALIGNED-NEXT: v_add_i32_e32 v2, vcc, 2, v0 ; GFX7-ALIGNED-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX7-ALIGNED-NEXT: flat_load_ushort v2, v[2:3] ; GFX7-ALIGNED-NEXT: flat_load_ushort v0, v[0:1] -; GFX7-ALIGNED-NEXT: flat_load_ushort v1, v[2:3] +; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(1) +; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31] ; @@ -23,10 +24,11 @@ ; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-UNALIGNED-NEXT: v_add_i32_e32 v2, vcc, 2, v0 ; GFX7-UNALIGNED-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX7-UNALIGNED-NEXT: flat_load_ushort v2, v[2:3] ; GFX7-UNALIGNED-NEXT: flat_load_ushort v0, v[0:1] -; GFX7-UNALIGNED-NEXT: flat_load_ushort v1, v[2:3] +; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(1) +; GFX7-UNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX7-UNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-UNALIGNED-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31] ; @@ -125,25 +127,26 @@ ; GFX7-ALIGNED-LABEL: global_load_2xi16_align1: ; GFX7-ALIGNED: ; %bb.0: ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-ALIGNED-NEXT: v_add_i32_e32 v2, vcc, 2, v0 +; GFX7-ALIGNED-NEXT: v_add_i32_e32 v2, vcc, 1, v0 ; GFX7-ALIGNED-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX7-ALIGNED-NEXT: v_add_i32_e32 v4, vcc, 1, v0 -; GFX7-ALIGNED-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX7-ALIGNED-NEXT: v_add_i32_e32 v6, vcc, 3, v0 -; GFX7-ALIGNED-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; GFX7-ALIGNED-NEXT: flat_load_ubyte v0, v[0:1] -; GFX7-ALIGNED-NEXT: flat_load_ubyte v1, v[6:7] -; GFX7-ALIGNED-NEXT: flat_load_ubyte v4, v[4:5] +; GFX7-ALIGNED-NEXT: flat_load_ubyte v4, v[2:3] +; GFX7-ALIGNED-NEXT: v_add_i32_e32 v2, vcc, 3, v0 +; GFX7-ALIGNED-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX7-ALIGNED-NEXT: flat_load_ubyte v5, v[0:1] ; GFX7-ALIGNED-NEXT: flat_load_ubyte v2, v[2:3] +; GFX7-ALIGNED-NEXT: v_add_i32_e32 v0, vcc, 2, v0 +; GFX7-ALIGNED-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-ALIGNED-NEXT: flat_load_ubyte v0, v[0:1] +; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(3) +; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 8, v4 ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-ALIGNED-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(1) -; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 8, v4 +; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX7-ALIGNED-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-UNALIGNED-LABEL: global_load_2xi16_align1: diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll @@ -12,10 +12,11 @@ ; GFX7-ALIGNED: ; %bb.0: ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-ALIGNED-NEXT: v_add_i32_e32 v1, vcc, 2, v0 -; GFX7-ALIGNED-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen ; GFX7-ALIGNED-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 offen -; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX7-ALIGNED-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen +; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(1) ; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31] ; @@ -23,10 +24,11 @@ ; GFX7-UNALIGNED: ; %bb.0: ; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-UNALIGNED-NEXT: v_add_i32_e32 v1, vcc, 2, v0 -; GFX7-UNALIGNED-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen ; GFX7-UNALIGNED-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 offen -; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX7-UNALIGNED-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen +; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(1) ; GFX7-UNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-UNALIGNED-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31] ; @@ -157,20 +159,21 @@ ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-ALIGNED-NEXT: v_add_i32_e32 v1, vcc, 2, v0 ; GFX7-ALIGNED-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GFX7-ALIGNED-NEXT: buffer_load_ubyte v3, v0, s[0:3], 0 offen -; GFX7-ALIGNED-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GFX7-ALIGNED-NEXT: buffer_load_ubyte v0, v0, s[0:3], 0 offen +; GFX7-ALIGNED-NEXT: v_add_i32_e32 v3, vcc, 3, v0 ; GFX7-ALIGNED-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen +; GFX7-ALIGNED-NEXT: buffer_load_ubyte v3, v3, s[0:3], 0 offen +; GFX7-ALIGNED-NEXT: buffer_load_ubyte v0, v0, s[0:3], 0 offen ; GFX7-ALIGNED-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen +; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(3) +; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(1) -; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX7-ALIGNED-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-ALIGNED-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-UNALIGNED-LABEL: private_load_2xi16_align1: diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll --- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll @@ -7,23 +7,23 @@ define amdgpu_kernel void @test_fmax_legacy_uge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmax_legacy_uge_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_nlt_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmax_legacy_uge_f64: @@ -59,23 +59,23 @@ define amdgpu_kernel void @test_fmax_legacy_oge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmax_legacy_oge_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ge_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmax_legacy_oge_f64: @@ -111,23 +111,23 @@ define amdgpu_kernel void @test_fmax_legacy_ugt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmax_legacy_ugt_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_nle_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmax_legacy_ugt_f64: @@ -163,23 +163,23 @@ define amdgpu_kernel void @test_fmax_legacy_ogt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmax_legacy_ogt_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmax_legacy_ogt_f64: diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll --- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll @@ -5,23 +5,23 @@ define amdgpu_kernel void @test_fmin_legacy_uge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmin_legacy_uge_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_nlt_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmin_legacy_uge_f64: @@ -57,23 +57,23 @@ define amdgpu_kernel void @test_fmin_legacy_ugt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmin_legacy_ugt_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_nle_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmin_legacy_ugt_f64: @@ -109,23 +109,23 @@ define amdgpu_kernel void @test_fmin_legacy_ule_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmin_legacy_ule_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ngt_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmin_legacy_ule_f64: @@ -161,23 +161,23 @@ define amdgpu_kernel void @test_fmin_legacy_ult_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmin_legacy_ult_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_nge_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmin_legacy_ult_f64: @@ -213,23 +213,23 @@ define amdgpu_kernel void @test_fmin_legacy_oge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmin_legacy_oge_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ge_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmin_legacy_oge_f64: @@ -265,23 +265,23 @@ define amdgpu_kernel void @test_fmin_legacy_ogt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmin_legacy_ogt_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmin_legacy_ogt_f64: @@ -317,23 +317,23 @@ define amdgpu_kernel void @test_fmin_legacy_ole_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmin_legacy_ole_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_le_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmin_legacy_ole_f64: @@ -369,23 +369,23 @@ define amdgpu_kernel void @test_fmin_legacy_olt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmin_legacy_olt_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmin_legacy_olt_f64: diff --git a/llvm/test/CodeGen/AMDGPU/fminnum.f64.ll b/llvm/test/CodeGen/AMDGPU/fminnum.f64.ll --- a/llvm/test/CodeGen/AMDGPU/fminnum.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fminnum.f64.ll @@ -15,7 +15,7 @@ ; GCN-DAG: v_max_f64 [[QUIETA:v\[[0-9]+:[0-9]+\]]], [[A]], [[A]] ; GCN-DAG: v_max_f64 [[QUIETB:v\[[0-9]+:[0-9]+\]]], [[B]], [[B]] -; GCN: v_min_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[QUIETA]], [[QUIETB]] +; GCN: v_min_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[QUIETB]], [[QUIETA]] define amdgpu_kernel void @test_fmin_f64_ieee_noflush([8 x i32], double %a, [8 x i32], double %b) #1 { %val = call double @llvm.minnum.f64(double %a, double %b) #0 store double %val, double addrspace(1)* undef, align 8 @@ -31,7 +31,7 @@ ; GFX9-DAG: v_max_f64 [[QUIETA:v\[[0-9]+:[0-9]+\]]], [[A]], [[A]] ; GFX9-DAG: v_max_f64 [[QUIETB:v\[[0-9]+:[0-9]+\]]], [[B]], [[B]] -; GCN: v_min_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[QUIETA]], [[QUIETB]] +; GCN: v_min_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[QUIETB]], [[QUIETA]] define amdgpu_kernel void @test_fmin_f64_ieee_flush([8 x i32], double %a, [8 x i32], double %b) #2 { %val = call double @llvm.minnum.f64(double %a, double %b) #0 store double %val, double addrspace(1)* undef, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-atomics.ll --- a/llvm/test/CodeGen/AMDGPU/fp-min-max-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-atomics.ll @@ -22,22 +22,22 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc, float %data, i32 %vindex) { ; SI-LABEL: raw_buffer_atomic_min_noret_f32: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 0 offen ; SI-NEXT: s_endpgm ; ; GFX7-LABEL: raw_buffer_atomic_min_noret_f32: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: raw_buffer_atomic_min_noret_f32: @@ -64,22 +64,22 @@ ; ; G_SI-LABEL: raw_buffer_atomic_min_noret_f32: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; G_SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) -; G_SI-NEXT: v_mov_b32_e32 v0, s0 -; G_SI-NEXT: v_mov_b32_e32 v1, s1 -; G_SI-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen glc +; G_SI-NEXT: v_mov_b32_e32 v0, s4 +; G_SI-NEXT: v_mov_b32_e32 v1, s5 +; G_SI-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 0 offen glc ; G_SI-NEXT: s_endpgm ; ; G_GFX7-LABEL: raw_buffer_atomic_min_noret_f32: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; G_GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX7-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX7-NEXT: v_mov_b32_e32 v1, s1 -; G_GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen glc +; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 +; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 +; G_GFX7-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 0 offen glc ; G_GFX7-NEXT: s_endpgm ; ; G_GFX10-LABEL: raw_buffer_atomic_min_noret_f32: @@ -111,26 +111,26 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { ; SI-LABEL: raw_buffer_atomic_min_noret_f64: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; SI-NEXT: s_load_dword s0, s[0:1], 0xf +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xf +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: v_mov_b32_e32 v2, s0 -; SI-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen ; SI-NEXT: s_endpgm ; ; GFX7-LABEL: raw_buffer_atomic_min_noret_f64: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GFX7-NEXT: s_load_dword s0, s[0:1], 0xf +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_load_dword s6, s[0:1], 0xf +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: raw_buffer_atomic_min_noret_f64: @@ -161,26 +161,26 @@ ; ; G_SI-LABEL: raw_buffer_atomic_min_noret_f64: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; G_SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; G_SI-NEXT: s_load_dword s0, s[0:1], 0xf +; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_SI-NEXT: s_load_dword s6, s[0:1], 0xf +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) -; G_SI-NEXT: v_mov_b32_e32 v0, s2 -; G_SI-NEXT: v_mov_b32_e32 v1, s3 -; G_SI-NEXT: v_mov_b32_e32 v2, s0 -; G_SI-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen glc +; G_SI-NEXT: v_mov_b32_e32 v0, s4 +; G_SI-NEXT: v_mov_b32_e32 v1, s5 +; G_SI-NEXT: v_mov_b32_e32 v2, s6 +; G_SI-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen glc ; G_SI-NEXT: s_endpgm ; ; G_GFX7-LABEL: raw_buffer_atomic_min_noret_f64: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; G_GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; G_GFX7-NEXT: s_load_dword s0, s[0:1], 0xf +; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_GFX7-NEXT: s_load_dword s6, s[0:1], 0xf +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX7-NEXT: v_mov_b32_e32 v0, s2 -; G_GFX7-NEXT: v_mov_b32_e32 v1, s3 -; G_GFX7-NEXT: v_mov_b32_e32 v2, s0 -; G_GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen glc +; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 +; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 +; G_GFX7-NEXT: v_mov_b32_e32 v2, s6 +; G_GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen glc ; G_GFX7-NEXT: s_endpgm ; ; G_GFX10-LABEL: raw_buffer_atomic_min_noret_f64: @@ -352,8 +352,8 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inreg %rsrc, float %data, i32 %vindex, float addrspace(3)* %out) { ; SI-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NEXT: s_load_dword s0, s[0:1], 0xf ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -367,8 +367,8 @@ ; ; GFX7-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX7-NEXT: s_load_dword s0, s[0:1], 0xf ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -413,8 +413,8 @@ ; ; G_SI-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; G_SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; G_SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; G_SI-NEXT: s_load_dword s0, s[0:1], 0xf ; G_SI-NEXT: s_mov_b32 m0, -1 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) @@ -428,8 +428,8 @@ ; ; G_GFX7-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; G_GFX7-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd +; G_GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; G_GFX7-NEXT: s_load_dword s2, s[0:1], 0xf ; G_GFX7-NEXT: s_mov_b32 m0, -1 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -546,22 +546,22 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc, float %data, i32 %vindex) { ; SI-LABEL: raw_buffer_atomic_max_noret_f32: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 0 offen ; SI-NEXT: s_endpgm ; ; GFX7-LABEL: raw_buffer_atomic_max_noret_f32: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: raw_buffer_atomic_max_noret_f32: @@ -588,22 +588,22 @@ ; ; G_SI-LABEL: raw_buffer_atomic_max_noret_f32: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; G_SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) -; G_SI-NEXT: v_mov_b32_e32 v0, s0 -; G_SI-NEXT: v_mov_b32_e32 v1, s1 -; G_SI-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen glc +; G_SI-NEXT: v_mov_b32_e32 v0, s4 +; G_SI-NEXT: v_mov_b32_e32 v1, s5 +; G_SI-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 0 offen glc ; G_SI-NEXT: s_endpgm ; ; G_GFX7-LABEL: raw_buffer_atomic_max_noret_f32: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; G_GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX7-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX7-NEXT: v_mov_b32_e32 v1, s1 -; G_GFX7-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen glc +; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 +; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 +; G_GFX7-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 0 offen glc ; G_GFX7-NEXT: s_endpgm ; ; G_GFX10-LABEL: raw_buffer_atomic_max_noret_f32: @@ -635,26 +635,26 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { ; SI-LABEL: raw_buffer_atomic_max_noret_f64: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; SI-NEXT: s_load_dword s0, s[0:1], 0xf +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xf +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: v_mov_b32_e32 v2, s0 -; SI-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen ; SI-NEXT: s_endpgm ; ; GFX7-LABEL: raw_buffer_atomic_max_noret_f64: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GFX7-NEXT: s_load_dword s0, s[0:1], 0xf +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_load_dword s6, s[0:1], 0xf +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: raw_buffer_atomic_max_noret_f64: @@ -685,26 +685,26 @@ ; ; G_SI-LABEL: raw_buffer_atomic_max_noret_f64: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; G_SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; G_SI-NEXT: s_load_dword s0, s[0:1], 0xf +; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_SI-NEXT: s_load_dword s6, s[0:1], 0xf +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) -; G_SI-NEXT: v_mov_b32_e32 v0, s2 -; G_SI-NEXT: v_mov_b32_e32 v1, s3 -; G_SI-NEXT: v_mov_b32_e32 v2, s0 -; G_SI-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen glc +; G_SI-NEXT: v_mov_b32_e32 v0, s4 +; G_SI-NEXT: v_mov_b32_e32 v1, s5 +; G_SI-NEXT: v_mov_b32_e32 v2, s6 +; G_SI-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen glc ; G_SI-NEXT: s_endpgm ; ; G_GFX7-LABEL: raw_buffer_atomic_max_noret_f64: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; G_GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; G_GFX7-NEXT: s_load_dword s0, s[0:1], 0xf +; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_GFX7-NEXT: s_load_dword s6, s[0:1], 0xf +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX7-NEXT: v_mov_b32_e32 v0, s2 -; G_GFX7-NEXT: v_mov_b32_e32 v1, s3 -; G_GFX7-NEXT: v_mov_b32_e32 v2, s0 -; G_GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen glc +; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 +; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 +; G_GFX7-NEXT: v_mov_b32_e32 v2, s6 +; G_GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen glc ; G_GFX7-NEXT: s_endpgm ; ; G_GFX10-LABEL: raw_buffer_atomic_max_noret_f64: @@ -876,30 +876,30 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inreg %rsrc, float %data, i32 %vindex, float addrspace(1)* %out) { ; SI-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; SI: ; %bb.0: ; %main_body +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xf -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; GFX7-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; GFX7: ; %bb.0: ; %main_body +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xf -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-NEXT: v_mov_b32_e32 v1, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -936,8 +936,8 @@ ; ; G_SI-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; G_SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; G_SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; G_SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xf ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s2 @@ -951,8 +951,8 @@ ; ; G_GFX7-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; G_GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; G_GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; G_GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xf ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s2 @@ -1002,9 +1002,9 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(3)* %out) { ; SI-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd ; SI-NEXT: s_load_dword s8, s[0:1], 0xf +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NEXT: s_load_dword s0, s[0:1], 0x10 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1019,16 +1019,16 @@ ; ; GFX7-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xf +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0xf +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc -; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc +; GFX7-NEXT: v_mov_b32_e32 v2, s7 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_write_b64 v2, v[0:1] ; GFX7-NEXT: s_endpgm @@ -1067,33 +1067,33 @@ ; ; G_SI-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; G_SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; G_SI-NEXT: s_load_dword s8, s[0:1], 0x10 -; G_SI-NEXT: s_load_dword s0, s[0:1], 0xf +; G_SI-NEXT: s_load_dword s8, s[0:1], 0xf +; G_SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; G_SI-NEXT: s_load_dword s0, s[0:1], 0x10 ; G_SI-NEXT: s_mov_b32 m0, -1 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s2 ; G_SI-NEXT: v_mov_b32_e32 v1, s3 -; G_SI-NEXT: v_mov_b32_e32 v2, s0 -; G_SI-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc ; G_SI-NEXT: v_mov_b32_e32 v2, s8 +; G_SI-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc +; G_SI-NEXT: v_mov_b32_e32 v2, s0 ; G_SI-NEXT: s_waitcnt vmcnt(0) ; G_SI-NEXT: ds_write_b64 v2, v[0:1] ; G_SI-NEXT: s_endpgm ; ; G_GFX7-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; G_GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; G_GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xf +; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_GFX7-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0xf +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_GFX7-NEXT: s_mov_b32 m0, -1 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX7-NEXT: v_mov_b32_e32 v0, s2 -; G_GFX7-NEXT: v_mov_b32_e32 v1, s3 -; G_GFX7-NEXT: v_mov_b32_e32 v2, s0 -; G_GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc -; G_GFX7-NEXT: v_mov_b32_e32 v2, s1 +; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 +; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 +; G_GFX7-NEXT: v_mov_b32_e32 v2, s6 +; G_GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc +; G_GFX7-NEXT: v_mov_b32_e32 v2, s7 ; G_GFX7-NEXT: s_waitcnt vmcnt(0) ; G_GFX7-NEXT: ds_write_b64 v2, v[0:1] ; G_GFX7-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -19,9 +19,9 @@ define amdgpu_kernel void @buffer_atomic_add_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 @@ -48,15 +48,15 @@ define amdgpu_kernel void @buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) { ; GFX90A-LABEL: buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, s10 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v3, s[4:7], 0 idxen offset:4 glc slc +; GFX90A-NEXT: v_mov_b32_e32 v2, s10 +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90A-NEXT: s_endpgm @@ -69,9 +69,9 @@ define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 @@ -98,15 +98,15 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) { ; GFX90A-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, s10 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v3, s[4:7], 4 offen glc slc +; GFX90A-NEXT: v_mov_b32_e32 v2, s10 +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 4 offen glc slc +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90A-NEXT: s_endpgm @@ -119,9 +119,9 @@ define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 @@ -148,15 +148,15 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) { ; GFX90A-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, s10 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v3, s[4:7], 0 idxen offset:4 glc slc +; GFX90A-NEXT: v_mov_b32_e32 v2, s10 +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90A-NEXT: s_endpgm @@ -169,9 +169,9 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 @@ -198,15 +198,15 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) { ; GFX90A-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, s10 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v3, s[4:7], 4 offen glc slc +; GFX90A-NEXT: v_mov_b32_e32 v2, s10 +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 4 offen glc slc +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90A-NEXT: s_endpgm @@ -219,9 +219,9 @@ define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 @@ -248,15 +248,15 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) { ; GFX90A-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, s10 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v3, s[4:7], 0 idxen offset:4 glc slc +; GFX90A-NEXT: v_mov_b32_e32 v2, s10 +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90A-NEXT: s_endpgm @@ -269,9 +269,9 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 @@ -298,15 +298,15 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) { ; GFX90A-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, s10 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v3, s[4:7], 4 offen glc slc +; GFX90A-NEXT: v_mov_b32_e32 v2, s10 +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 4 offen glc slc +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90A-NEXT: s_endpgm @@ -319,9 +319,9 @@ define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 @@ -348,15 +348,15 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) { ; GFX90A-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, s10 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v3, s[4:7], 0 idxen offset:4 glc slc +; GFX90A-NEXT: v_mov_b32_e32 v2, s10 +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90A-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll --- a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll +++ b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll @@ -214,25 +214,25 @@ ; ; VI-LABEL: fp_to_sint_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s2, 0x2f800000 -; VI-NEXT: s_mov_b32 s3, 0xcf800000 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s4, 0x2f800000 +; VI-NEXT: s_mov_b32 s5, 0xcf800000 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_trunc_f32_e32 v0, s0 -; VI-NEXT: v_mul_f32_e64 v1, |v0|, s2 +; VI-NEXT: v_trunc_f32_e32 v0, s2 +; VI-NEXT: v_mul_f32_e64 v1, |v0|, s4 ; VI-NEXT: v_floor_f32_e32 v1, v1 -; VI-NEXT: v_fma_f32 v2, v1, s3, |v0| +; VI-NEXT: v_fma_f32 v2, v1, s5, |v0| ; VI-NEXT: v_cvt_u32_f32_e32 v2, v2 ; VI-NEXT: v_cvt_u32_f32_e32 v1, v1 ; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v0 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_xor_b32_e32 v0, v2, v3 ; VI-NEXT: v_xor_b32_e32 v1, v1, v3 ; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v3 ; VI-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: fp_to_sint_i64: @@ -326,24 +326,23 @@ ; ; VI-LABEL: fp_to_sint_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s2, 0x2f800000 -; VI-NEXT: s_mov_b32 s3, 0xcf800000 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s6, 0x2f800000 +; VI-NEXT: s_mov_b32 s7, 0xcf800000 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_trunc_f32_e32 v0, s1 -; VI-NEXT: v_mul_f32_e64 v1, |v0|, s2 +; VI-NEXT: v_trunc_f32_e32 v0, s5 +; VI-NEXT: v_mul_f32_e64 v1, |v0|, s6 ; VI-NEXT: v_floor_f32_e32 v1, v1 -; VI-NEXT: v_fma_f32 v2, v1, s3, |v0| -; VI-NEXT: v_trunc_f32_e32 v4, s0 +; VI-NEXT: v_fma_f32 v2, v1, s7, |v0| +; VI-NEXT: v_trunc_f32_e32 v4, s4 ; VI-NEXT: v_cvt_u32_f32_e32 v2, v2 -; VI-NEXT: v_mul_f32_e64 v3, |v4|, s2 +; VI-NEXT: v_mul_f32_e64 v3, |v4|, s6 ; VI-NEXT: v_cvt_u32_f32_e32 v1, v1 ; VI-NEXT: v_floor_f32_e32 v3, v3 ; VI-NEXT: v_cvt_u32_f32_e32 v5, v3 -; VI-NEXT: v_fma_f32 v3, v3, s3, |v4| +; VI-NEXT: v_fma_f32 v3, v3, s7, |v4| ; VI-NEXT: v_ashrrev_i32_e32 v0, 31, v0 ; VI-NEXT: v_cvt_u32_f32_e32 v6, v3 ; VI-NEXT: v_xor_b32_e32 v2, v2, v0 @@ -354,8 +353,9 @@ ; VI-NEXT: v_xor_b32_e32 v0, v6, v1 ; VI-NEXT: v_xor_b32_e32 v4, v5, v1 ; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v1 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: fp_to_sint_v2i64: @@ -506,19 +506,18 @@ ; ; VI-LABEL: fp_to_sint_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s8, 0x2f800000 ; VI-NEXT: s_mov_b32 s9, 0xcf800000 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_trunc_f32_e32 v0, s1 +; VI-NEXT: v_trunc_f32_e32 v0, s5 ; VI-NEXT: v_mul_f32_e64 v1, |v0|, s8 ; VI-NEXT: v_floor_f32_e32 v1, v1 ; VI-NEXT: v_fma_f32 v2, v1, s9, |v0| ; VI-NEXT: v_cvt_u32_f32_e32 v2, v2 -; VI-NEXT: v_trunc_f32_e32 v4, s0 +; VI-NEXT: v_trunc_f32_e32 v4, s4 ; VI-NEXT: v_cvt_u32_f32_e32 v1, v1 ; VI-NEXT: v_mul_f32_e64 v3, |v4|, s8 ; VI-NEXT: v_floor_f32_e32 v3, v3 @@ -532,7 +531,7 @@ ; VI-NEXT: v_subb_u32_e32 v3, vcc, v1, v0, vcc ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v4 ; VI-NEXT: v_xor_b32_e32 v4, v5, v1 -; VI-NEXT: v_trunc_f32_e32 v5, s3 +; VI-NEXT: v_trunc_f32_e32 v5, s7 ; VI-NEXT: v_xor_b32_e32 v0, v6, v1 ; VI-NEXT: v_mul_f32_e64 v6, |v5|, s8 ; VI-NEXT: v_floor_f32_e32 v6, v6 @@ -542,7 +541,7 @@ ; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v1 ; VI-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc ; VI-NEXT: v_ashrrev_i32_e32 v4, 31, v5 -; VI-NEXT: v_trunc_f32_e32 v8, s2 +; VI-NEXT: v_trunc_f32_e32 v8, s6 ; VI-NEXT: v_xor_b32_e32 v5, v6, v4 ; VI-NEXT: v_mul_f32_e64 v6, |v8|, s8 ; VI-NEXT: v_floor_f32_e32 v6, v6 @@ -556,9 +555,10 @@ ; VI-NEXT: v_xor_b32_e32 v4, v10, v5 ; VI-NEXT: v_xor_b32_e32 v8, v9, v5 ; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v5 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_subb_u32_e32 v5, vcc, v8, v5, vcc -; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: fp_to_sint_v4i64: @@ -748,14 +748,14 @@ ; ; VI-LABEL: fp_to_uint_f32_to_i1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_eq_f32_e64 s[0:1], -1.0, s0 -; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; VI-NEXT: v_cmp_eq_f32_e64 s[4:5], -1.0, s4 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: fp_to_uint_f32_to_i1: @@ -798,14 +798,14 @@ ; ; VI-LABEL: fp_to_uint_fabs_f32_to_i1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_eq_f32_e64 s[0:1], -1.0, |s0| -; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; VI-NEXT: v_cmp_eq_f32_e64 s[4:5], -1.0, |s4| +; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: fp_to_uint_fabs_f32_to_i1: diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll --- a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll +++ b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll @@ -168,8 +168,8 @@ ; VI-LABEL: fp_to_uint_f32_to_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s3, 0xcf800000 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xcf800000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_trunc_f32_e32 v0, s2 ; VI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 @@ -263,8 +263,8 @@ ; VI-LABEL: fp_to_uint_v2f32_to_v2i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s4, 0x2f800000 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s4, 0x2f800000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_trunc_f32_e32 v0, s3 ; VI-NEXT: v_trunc_f32_e32 v4, s2 @@ -412,39 +412,39 @@ ; ; VI-LABEL: fp_to_uint_v4f32_to_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s6, 0x2f800000 -; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s2, 0x2f800000 +; VI-NEXT: s_mov_b32 s3, 0xcf800000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_trunc_f32_e32 v0, s1 -; VI-NEXT: v_trunc_f32_e32 v4, s0 -; VI-NEXT: v_mul_f32_e32 v1, s6, v0 -; VI-NEXT: v_mul_f32_e32 v2, s6, v4 +; VI-NEXT: v_trunc_f32_e32 v0, s5 +; VI-NEXT: v_trunc_f32_e32 v4, s4 +; VI-NEXT: v_mul_f32_e32 v1, s2, v0 +; VI-NEXT: v_mul_f32_e32 v2, s2, v4 ; VI-NEXT: v_floor_f32_e32 v5, v1 -; VI-NEXT: s_mov_b32 s0, 0xcf800000 ; VI-NEXT: v_floor_f32_e32 v6, v2 -; VI-NEXT: v_fma_f32 v0, v5, s0, v0 +; VI-NEXT: v_fma_f32 v0, v5, s3, v0 ; VI-NEXT: v_cvt_u32_f32_e32 v2, v0 -; VI-NEXT: v_fma_f32 v0, v6, s0, v4 -; VI-NEXT: v_trunc_f32_e32 v4, s3 +; VI-NEXT: v_fma_f32 v0, v6, s3, v4 +; VI-NEXT: v_trunc_f32_e32 v4, s7 ; VI-NEXT: v_cvt_u32_f32_e32 v3, v5 -; VI-NEXT: v_mul_f32_e32 v5, s6, v4 -; VI-NEXT: v_trunc_f32_e32 v8, s2 +; VI-NEXT: v_mul_f32_e32 v5, s2, v4 +; VI-NEXT: v_trunc_f32_e32 v8, s6 ; VI-NEXT: v_cvt_u32_f32_e32 v1, v6 ; VI-NEXT: v_floor_f32_e32 v6, v5 -; VI-NEXT: v_mul_f32_e32 v5, s6, v8 +; VI-NEXT: v_mul_f32_e32 v5, s2, v8 ; VI-NEXT: v_floor_f32_e32 v9, v5 -; VI-NEXT: v_fma_f32 v4, v6, s0, v4 +; VI-NEXT: v_fma_f32 v4, v6, s3, v4 ; VI-NEXT: v_cvt_u32_f32_e32 v7, v6 ; VI-NEXT: v_cvt_u32_f32_e32 v6, v4 -; VI-NEXT: v_fma_f32 v4, v9, s0, v8 +; VI-NEXT: v_fma_f32 v4, v9, s3, v8 ; VI-NEXT: v_cvt_u32_f32_e32 v5, v9 ; VI-NEXT: v_cvt_u32_f32_e32 v4, v4 ; VI-NEXT: v_cvt_u32_f32_e32 v0, v0 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: fp_to_uint_v4f32_to_v4i64: @@ -634,14 +634,14 @@ ; ; VI-LABEL: fp_to_uint_f32_to_i1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_eq_f32_e64 s[0:1], 1.0, s0 -; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; VI-NEXT: v_cmp_eq_f32_e64 s[4:5], 1.0, s4 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: fp_to_uint_f32_to_i1: @@ -684,14 +684,14 @@ ; ; VI-LABEL: fp_to_uint_fabs_f32_to_i1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_eq_f32_e64 s[0:1], 1.0, |s0| -; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; VI-NEXT: v_cmp_eq_f32_e64 s[4:5], 1.0, |s4| +; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: fp_to_uint_fabs_f32_to_i1: diff --git a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll @@ -138,7 +138,7 @@ ; SI: v_cmp_eq_f32_e32 vcc, -1.0, v{{[0-9]+}} ; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc ; VI: v_cmp_eq_f16_e64 s{{\[[0-9]+:[0-9]+\]}}, 0xbc00, s{{[0-9]+}} -; VI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, s[0:1] +; VI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, s[4:5] define amdgpu_kernel void @fptosi_f16_to_i1(i1 addrspace(1)* %out, half %in) { entry: %conv = fptosi half %in to i1 diff --git a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll @@ -134,7 +134,7 @@ ; SI: v_cmp_eq_f32_e32 vcc, 1.0, v{{[0-9]+}} ; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc ; VI: v_cmp_eq_f16_e64 s{{\[[0-9]+:[0-9]+\]}}, 1.0, s{{[0-9]+}} -; VI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, s[0:1] +; VI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, s[4:5] define amdgpu_kernel void @fptoui_f16_to_i1(i1 addrspace(1)* %out, half %in) { entry: %conv = fptoui half %in to i1 diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll --- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll +++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll @@ -163,12 +163,12 @@ ; GCN: s_and_saveexec_b64 -; CI: v_add_i32_e32 [[GEP:v[0-9]+]], vcc, 4, [[SHIFT]] ; CI: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4 glc{{$}} +; CI: v_add_i32_e32 [[GEP:v[0-9]+]], vcc, 4, [[SHIFT]] -; GFX9: v_add_u32_e32 [[GEP:v[0-9]+]], 4, [[SP]] ; GFX9-MUBUF: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4 glc{{$}} ; GFX9-FLATSCR: scratch_load_dword v{{[0-9]+}}, off, s32 offset:4 glc{{$}} +; GFX9: v_add_u32_e32 [[GEP:v[0-9]+]], 4, [[SP]] ; GCN: ds_write_b32 v{{[0-9]+}}, [[GEP]] define void @void_func_byval_struct_i8_i32_ptr_nonentry_block({ i8, i32 } addrspace(5)* byval({ i8, i32 }) %arg0, i32 %arg2) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll --- a/llvm/test/CodeGen/AMDGPU/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/fshl.ll @@ -12,15 +12,15 @@ define amdgpu_kernel void @fshl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %z) { ; SI-LABEL: fshl_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_load_dword s0, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: s_not_b32 s0, s0 ; SI-NEXT: v_alignbit_b32 v0, s2, v0, 1 +; SI-NEXT: s_not_b32 s0, s0 ; SI-NEXT: s_lshr_b32 s1, s2, 1 ; SI-NEXT: v_mov_b32_e32 v1, s0 ; SI-NEXT: v_alignbit_b32 v0, s1, v0, v1 @@ -29,35 +29,35 @@ ; ; VI-LABEL: fshl_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s5 -; VI-NEXT: s_not_b32 s0, s0 -; VI-NEXT: s_lshr_b32 s1, s4, 1 -; VI-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: v_alignbit_b32 v2, s1, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: s_not_b32 s4, s4 +; VI-NEXT: s_lshr_b32 s3, s2, 1 +; VI-NEXT: v_alignbit_b32 v0, s2, v0, 1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_alignbit_b32 v2, s3, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshl_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s6, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_lshr_b32 s0, s2, 1 ; GFX9-NEXT: s_not_b32 s1, s6 -; GFX9-NEXT: s_lshr_b32 s0, s4, 1 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 1 +; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, 1 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_alignbit_b32 v1, s0, v1, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_i32: @@ -97,37 +97,37 @@ define amdgpu_kernel void @fshl_i32_imm(i32 addrspace(1)* %in, i32 %x, i32 %y) { ; SI-LABEL: fshl_i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_alignbit_b32 v0, s0, v0, 25 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_alignbit_b32 v0, s4, v0, 25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshl_i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_alignbit_b32 v2, s0, v0, 25 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_alignbit_b32 v2, s2, v0, 25 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshl_i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 25 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, 25 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_i32_imm: @@ -161,23 +161,23 @@ define amdgpu_kernel void @fshl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) { ; SI-LABEL: fshl_v2i32: ; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xf ; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s9 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_alignbit_b32 v0, s9, v0, 1 ; SI-NEXT: s_not_b32 s1, s1 -; SI-NEXT: v_alignbit_b32 v0, s3, v0, 1 -; SI-NEXT: s_lshr_b32 s3, s3, 1 +; SI-NEXT: s_lshr_b32 s3, s9, 1 ; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: v_alignbit_b32 v1, s3, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: s_not_b32 s0, s0 -; SI-NEXT: v_alignbit_b32 v0, s2, v0, 1 -; SI-NEXT: s_lshr_b32 s1, s2, 1 +; SI-NEXT: v_alignbit_b32 v0, s8, v0, 1 +; SI-NEXT: s_lshr_b32 s1, s8, 1 ; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: v_alignbit_b32 v0, s1, v0, v2 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -185,49 +185,49 @@ ; ; VI-LABEL: fshl_v2i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: s_not_b32 s1, s1 -; VI-NEXT: s_lshr_b32 s7, s5, 1 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: s_lshr_b32 s3, s5, 1 ; VI-NEXT: v_alignbit_b32 v0, s5, v0, 1 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_alignbit_b32 v1, s7, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_not_b32 s0, s0 +; VI-NEXT: s_not_b32 s5, s7 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_alignbit_b32 v1, s3, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: s_not_b32 s3, s6 ; VI-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; VI-NEXT: s_lshr_b32 s1, s4, 1 +; VI-NEXT: s_lshr_b32 s2, s4, 1 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_alignbit_b32 v0, s2, v0, v2 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_alignbit_b32 v0, s1, v0, v2 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshl_v2i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x3c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 1 ; GFX9-NEXT: s_lshr_b32 s0, s5, 1 -; GFX9-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-NEXT: s_not_b32 s1, s9 -; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_alignbit_b32 v1, s0, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: s_not_b32 s1, s8 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1 ; GFX9-NEXT: s_lshr_b32 s0, s4, 1 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_v2i32: @@ -276,46 +276,46 @@ define amdgpu_kernel void @fshl_v2i32_imm(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) { ; SI-LABEL: fshl_v2i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_alignbit_b32 v1, s3, v0, 23 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_alignbit_b32 v0, s2, v0, 25 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_alignbit_b32 v1, s7, v0, 23 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_alignbit_b32 v0, s6, v0, 25 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshl_v2i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_alignbit_b32 v1, s5, v0, 23 ; VI-NEXT: v_alignbit_b32 v0, s4, v2, 25 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshl_v2i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 23 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v3, 25 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_v2i32_imm: @@ -353,106 +353,106 @@ define amdgpu_kernel void @fshl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { ; SI-LABEL: fshl_v4i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; SI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0xd ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x15 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s15 +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: v_alignbit_b32 v0, s15, v0, 1 ; SI-NEXT: s_not_b32 s3, s3 -; SI-NEXT: v_alignbit_b32 v0, s11, v0, 1 -; SI-NEXT: s_lshr_b32 s11, s11, 1 +; SI-NEXT: s_lshr_b32 s7, s15, 1 ; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: v_alignbit_b32 v3, s11, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s14 +; SI-NEXT: v_alignbit_b32 v3, s7, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: s_not_b32 s2, s2 -; SI-NEXT: v_alignbit_b32 v0, s10, v0, 1 -; SI-NEXT: s_lshr_b32 s3, s10, 1 +; SI-NEXT: v_alignbit_b32 v0, s14, v0, 1 +; SI-NEXT: s_lshr_b32 s3, s14, 1 ; SI-NEXT: v_mov_b32_e32 v1, s2 ; SI-NEXT: v_alignbit_b32 v2, s3, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s13 +; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: s_not_b32 s1, s1 -; SI-NEXT: v_alignbit_b32 v0, s9, v0, 1 -; SI-NEXT: s_lshr_b32 s2, s9, 1 +; SI-NEXT: v_alignbit_b32 v0, s13, v0, 1 +; SI-NEXT: s_lshr_b32 s2, s13, 1 ; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: v_alignbit_b32 v1, s2, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_not_b32 s0, s0 -; SI-NEXT: v_alignbit_b32 v0, s8, v0, 1 -; SI-NEXT: s_lshr_b32 s1, s8, 1 +; SI-NEXT: v_alignbit_b32 v0, s12, v0, 1 +; SI-NEXT: s_lshr_b32 s1, s12, 1 ; SI-NEXT: v_mov_b32_e32 v4, s0 ; SI-NEXT: v_alignbit_b32 v0, s1, v0, v4 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshl_v4i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s11 -; VI-NEXT: s_not_b32 s3, s3 -; VI-NEXT: s_lshr_b32 s11, s7, 1 -; VI-NEXT: v_alignbit_b32 v0, s7, v0, 1 +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: s_lshr_b32 s2, s11, 1 +; VI-NEXT: s_not_b32 s3, s15 +; VI-NEXT: v_alignbit_b32 v0, s11, v0, 1 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_alignbit_b32 v3, s2, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: s_not_b32 s3, s14 +; VI-NEXT: v_alignbit_b32 v0, s10, v0, 1 +; VI-NEXT: s_lshr_b32 s2, s10, 1 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_alignbit_b32 v2, s2, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: s_not_b32 s3, s13 +; VI-NEXT: v_alignbit_b32 v0, s9, v0, 1 +; VI-NEXT: s_lshr_b32 s2, s9, 1 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_alignbit_b32 v3, s11, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s10 -; VI-NEXT: s_not_b32 s2, s2 -; VI-NEXT: v_alignbit_b32 v0, s6, v0, 1 -; VI-NEXT: s_lshr_b32 s3, s6, 1 -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_alignbit_b32 v2, s3, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s9 -; VI-NEXT: s_not_b32 s1, s1 -; VI-NEXT: v_alignbit_b32 v0, s5, v0, 1 -; VI-NEXT: s_lshr_b32 s2, s5, 1 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_alignbit_b32 v1, s2, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: s_not_b32 s0, s0 -; VI-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; VI-NEXT: s_lshr_b32 s1, s4, 1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_not_b32 s3, s12 +; VI-NEXT: v_alignbit_b32 v0, s8, v0, 1 +; VI-NEXT: s_lshr_b32 s2, s8, 1 +; VI-NEXT: v_mov_b32_e32 v4, s3 +; VI-NEXT: v_alignbit_b32 v0, s2, v0, v4 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_alignbit_b32 v0, s1, v0, v4 -; VI-NEXT: v_mov_b32_e32 v4, s12 -; VI-NEXT: v_mov_b32_e32 v5, s13 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshl_v4i32: ; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 ; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s0, s7, 1 -; GFX9-NEXT: v_mov_b32_e32 v0, s11 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_alignbit_b32 v0, s11, v0, 1 +; GFX9-NEXT: s_lshr_b32 s0, s11, 1 ; GFX9-NEXT: s_not_b32 s1, s15 -; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_alignbit_b32 v3, s0, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s10 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: s_not_b32 s1, s14 -; GFX9-NEXT: v_alignbit_b32 v0, s6, v0, 1 -; GFX9-NEXT: s_lshr_b32 s0, s6, 1 +; GFX9-NEXT: v_alignbit_b32 v0, s10, v0, 1 +; GFX9-NEXT: s_lshr_b32 s0, s10, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_alignbit_b32 v2, s0, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NEXT: s_not_b32 s1, s13 -; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 1 -; GFX9-NEXT: s_lshr_b32 s0, s5, 1 +; GFX9-NEXT: v_alignbit_b32 v0, s9, v0, 1 +; GFX9-NEXT: s_lshr_b32 s0, s9, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_alignbit_b32 v1, s0, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_not_b32 s1, s12 -; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; GFX9-NEXT: s_lshr_b32 s0, s4, 1 +; GFX9-NEXT: v_alignbit_b32 v0, s8, v0, 1 +; GFX9-NEXT: s_lshr_b32 s0, s8, 1 ; GFX9-NEXT: v_mov_b32_e32 v5, s1 ; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v5 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] @@ -520,57 +520,57 @@ define amdgpu_kernel void @fshl_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) { ; SI-LABEL: fshl_v4i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x11 ; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: v_mov_b32_e32 v0, s7 ; SI-NEXT: v_alignbit_b32 v3, s11, v0, 31 -; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: v_alignbit_b32 v2, s10, v0, 23 -; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_alignbit_b32 v1, s9, v0, 25 -; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_alignbit_b32 v0, s8, v0, 31 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshl_v4i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, s8 -; VI-NEXT: v_mov_b32_e32 v5, s9 -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_alignbit_b32 v3, s7, v0, 31 -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_alignbit_b32 v2, s6, v1, 23 -; VI-NEXT: v_alignbit_b32 v1, s5, v0, 25 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_alignbit_b32 v0, s4, v0, 31 +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_alignbit_b32 v3, s11, v0, 31 +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_alignbit_b32 v2, s10, v1, 23 +; VI-NEXT: v_alignbit_b32 v1, s9, v0, 25 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_alignbit_b32 v0, s8, v0, 31 +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshl_v4i32_imm: ; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s11 -; GFX9-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-NEXT: v_alignbit_b32 v3, s7, v0, 31 -; GFX9-NEXT: v_mov_b32_e32 v0, s9 -; GFX9-NEXT: v_alignbit_b32 v2, s6, v1, 23 -; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 25 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 31 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_alignbit_b32 v3, s11, v0, 31 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_alignbit_b32 v2, s10, v1, 23 +; GFX9-NEXT: v_alignbit_b32 v1, s9, v0, 25 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_alignbit_b32 v0, s8, v0, 31 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -21,43 +21,43 @@ define amdgpu_kernel void @fshr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %z) { ; SI-LABEL: fshr_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; SI-NEXT: s_load_dword s0, s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dword s6, s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: v_alignbit_b32 v0, s2, v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_alignbit_b32 v0, s4, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshr_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s5 -; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: v_alignbit_b32 v2, s4, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_alignbit_b32 v2, s2, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshr_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: s_load_dword s6, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshr_i32: @@ -92,37 +92,37 @@ define amdgpu_kernel void @fshr_i32_imm(i32 addrspace(1)* %in, i32 %x, i32 %y) { ; SI-LABEL: fshr_i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_alignbit_b32 v0, s0, v0, 7 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_alignbit_b32 v0, s4, v0, 7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshr_i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_alignbit_b32 v2, s0, v0, 7 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_alignbit_b32 v2, s2, v0, 7 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshr_i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 7 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, 7 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshr_i32_imm: @@ -156,55 +156,55 @@ define amdgpu_kernel void @fshr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) { ; SI-LABEL: fshr_v2i32: ; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xf +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s9 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_alignbit_b32 v1, s3, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: v_mov_b32_e32 v2, s0 -; SI-NEXT: v_alignbit_b32 v0, s2, v0, v2 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_alignbit_b32 v1, s1, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_alignbit_b32 v0, s0, v0, v2 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshr_v2i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_alignbit_b32 v1, s5, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_alignbit_b32 v1, s7, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_alignbit_b32 v0, s6, v0, v2 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_alignbit_b32 v0, s4, v0, v2 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshr_v2i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s8 -; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_alignbit_b32 v1, s7, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_alignbit_b32 v0, s6, v0, v3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshr_v2i32: @@ -245,46 +245,46 @@ define amdgpu_kernel void @fshr_v2i32_imm(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) { ; SI-LABEL: fshr_v2i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_alignbit_b32 v1, s3, v0, 9 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_alignbit_b32 v0, s2, v0, 7 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_alignbit_b32 v1, s7, v0, 9 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_alignbit_b32 v0, s6, v0, 7 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshr_v2i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_alignbit_b32 v1, s5, v0, 9 ; VI-NEXT: v_alignbit_b32 v0, s4, v2, 7 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshr_v2i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 9 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v3, 7 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshr_v2i32_imm: @@ -322,72 +322,72 @@ define amdgpu_kernel void @fshr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { ; SI-LABEL: fshr_v4i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; SI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x11 -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x15 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x15 +; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s15, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s15 -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: v_alignbit_b32 v3, s11, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s14 -; SI-NEXT: v_mov_b32_e32 v1, s2 -; SI-NEXT: v_alignbit_b32 v2, s10, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s13 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_alignbit_b32 v1, s9, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s12 -; SI-NEXT: v_mov_b32_e32 v4, s0 -; SI-NEXT: v_alignbit_b32 v0, s8, v0, v4 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: v_mov_b32_e32 v1, s11 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_alignbit_b32 v3, s3, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s10 +; SI-NEXT: v_alignbit_b32 v2, s2, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: v_alignbit_b32 v1, s1, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: v_alignbit_b32 v0, s0, v0, v4 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshr_v4i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x54 +; VI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s11 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_alignbit_b32 v3, s7, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s10 -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_alignbit_b32 v2, s6, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s9 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_alignbit_b32 v1, s5, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v1, s11 +; VI-NEXT: v_alignbit_b32 v3, s15, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s10 +; VI-NEXT: v_alignbit_b32 v2, s14, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_alignbit_b32 v1, s13, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v4, s8 +; VI-NEXT: v_alignbit_b32 v0, s12, v0, v4 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_alignbit_b32 v0, s4, v0, v4 -; VI-NEXT: v_mov_b32_e32 v4, s12 -; VI-NEXT: v_mov_b32_e32 v5, s13 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshr_v4i32: ; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x54 +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 -; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s11 -; GFX9-NEXT: v_mov_b32_e32 v1, s15 -; GFX9-NEXT: v_alignbit_b32 v3, s7, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: v_mov_b32_e32 v1, s14 -; GFX9-NEXT: v_alignbit_b32 v2, s6, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s9 -; GFX9-NEXT: v_mov_b32_e32 v1, s13 -; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v5, s12 -; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v5 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s11 +; GFX9-NEXT: v_alignbit_b32 v3, s15, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NEXT: v_alignbit_b32 v2, s14, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_alignbit_b32 v1, s13, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-NEXT: v_alignbit_b32 v0, s12, v0, v5 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm ; @@ -437,57 +437,57 @@ define amdgpu_kernel void @fshr_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) { ; SI-LABEL: fshr_v4i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x11 ; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: v_mov_b32_e32 v0, s7 ; SI-NEXT: v_alignbit_b32 v3, s11, v0, 1 -; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: v_alignbit_b32 v2, s10, v0, 9 -; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_alignbit_b32 v1, s9, v0, 7 -; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_alignbit_b32 v0, s8, v0, 1 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshr_v4i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, s8 -; VI-NEXT: v_mov_b32_e32 v5, s9 -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_alignbit_b32 v3, s7, v0, 1 -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_alignbit_b32 v2, s6, v1, 9 -; VI-NEXT: v_alignbit_b32 v1, s5, v0, 7 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_alignbit_b32 v0, s4, v0, 1 +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_alignbit_b32 v3, s11, v0, 1 +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_alignbit_b32 v2, s10, v1, 9 +; VI-NEXT: v_alignbit_b32 v1, s9, v0, 7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_alignbit_b32 v0, s8, v0, 1 +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshr_v4i32_imm: ; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s11 -; GFX9-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-NEXT: v_alignbit_b32 v3, s7, v0, 1 -; GFX9-NEXT: v_mov_b32_e32 v0, s9 -; GFX9-NEXT: v_alignbit_b32 v2, s6, v1, 9 -; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 7 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_alignbit_b32 v3, s11, v0, 1 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_alignbit_b32 v2, s10, v1, 9 +; GFX9-NEXT: v_alignbit_b32 v1, s9, v0, 7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_alignbit_b32 v0, s8, v0, 1 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll b/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll --- a/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll +++ b/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll @@ -95,11 +95,12 @@ ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 5, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx2 v[4:5], v6, s[0:1] offset:16 ; GCN-NEXT: global_load_dwordx4 v[0:3], v6, s[0:1] -; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: global_load_dwordx2 v[4:5], v6, s[0:1] offset:16 +; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_and_b32_e32 v1, v3, v1 ; GCN-NEXT: v_and_b32_e32 v0, v2, v0 +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_and_b32_e32 v1, v1, v5 ; GCN-NEXT: v_and_b32_e32 v0, v0, v4 ; GCN-NEXT: v_not_b32_e32 v0, v0 @@ -156,11 +157,12 @@ ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 5, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx2 v[4:5], v6, s[0:1] offset:16 ; GCN-NEXT: global_load_dwordx4 v[0:3], v6, s[0:1] -; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: global_load_dwordx2 v[4:5], v6, s[0:1] offset:16 +; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_xor_b32_e32 v1, v3, v1 ; GCN-NEXT: v_xor_b32_e32 v0, v2, v0 +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_xnor_b32_e32 v0, v0, v4 ; GCN-NEXT: v_xnor_b32_e32 v1, v1, v5 ; GCN-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -4093,8 +4093,8 @@ ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v28, 0 +; GFX9-NEXT: global_load_dword v32, v[0:1], off ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v28, s[30:31] ; GFX9-NEXT: global_load_dwordx4 v[4:7], v28, s[30:31] offset:16 @@ -4105,12 +4105,11 @@ ; GFX9-NEXT: global_load_dwordx4 v[24:27], v28, s[30:31] offset:96 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dwordx4 v[28:31], v28, s[30:31] offset:112 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[30:31] ; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v32i32_i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v32i32_i32@rel32@hi+12 -; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: global_load_dword v32, v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 @@ -4346,8 +4345,8 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v2, s[30:31] offset:4 ; GFX9-NEXT: global_load_ubyte v0, v2, s[30:31] +; GFX9-NEXT: global_load_dword v1, v2, s[30:31] offset:4 ; GFX9-NEXT: s_getpc_b64 s[30:31] ; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_struct_i8_i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_struct_i8_i32@rel32@hi+12 @@ -4850,8 +4849,8 @@ ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 30 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:8 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:12 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:12 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:8 ; GFX9-NEXT: v_writelane_b32 v40, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s35, 1 ; GFX9-NEXT: v_writelane_b32 v40, s36, 2 @@ -4887,9 +4886,10 @@ ; GFX9-NEXT: s_getpc_b64 s[6:7] ; GFX9-NEXT: s_add_u32 s6, s6, byval_align16_f64_arg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s7, s7, byval_align16_f64_arg@rel32@hi+12 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX9-NEXT: v_readlane_b32 s63, v40, 29 ; GFX9-NEXT: v_readlane_b32 s62, v40, 28 @@ -5850,10 +5850,10 @@ ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_mov_b32 s8, 1 ; GFX9-NEXT: s_mov_b32 s9, 2 -; GFX9-NEXT: s_getpc_b64 s[34:35] -; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3i64_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s35, s35, external_void_func_v3i64_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v3i64_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v3i64_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] ; GFX9-NEXT: v_readlane_b32 s30, v40, 6 ; GFX9-NEXT: v_readlane_b32 s31, v40, 7 ; GFX9-NEXT: v_readlane_b32 s9, v40, 5 @@ -5990,10 +5990,10 @@ ; GFX9-NEXT: s_mov_b32 s9, 2 ; GFX9-NEXT: s_mov_b32 s10, 3 ; GFX9-NEXT: s_mov_b32 s11, 4 -; GFX9-NEXT: s_getpc_b64 s[34:35] -; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v4i64_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s35, s35, external_void_func_v4i64_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v4i64_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v4i64_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] ; GFX9-NEXT: v_readlane_b32 s30, v40, 8 ; GFX9-NEXT: v_readlane_b32 s31, v40, 9 ; GFX9-NEXT: v_readlane_b32 s11, v40, 7 @@ -8637,12 +8637,12 @@ ; GFX9-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_getpc_b64 s[34:35] -; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v8i32_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s35, s35, external_void_func_v8i32_inreg@rel32@hi+12 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[30:31], 0x0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v8i32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v8i32_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] ; GFX9-NEXT: v_readlane_b32 s30, v40, 8 ; GFX9-NEXT: v_readlane_b32 s31, v40, 9 ; GFX9-NEXT: v_readlane_b32 s11, v40, 7 @@ -8946,12 +8946,12 @@ ; GFX9-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_getpc_b64 s[34:35] -; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v16i32_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s35, s35, external_void_func_v16i32_inreg@rel32@hi+12 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dwordx16 s[4:19], s[30:31], 0x0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v16i32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v16i32_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] ; GFX9-NEXT: v_readlane_b32 s30, v40, 16 ; GFX9-NEXT: v_readlane_b32 s31, v40, 17 ; GFX9-NEXT: v_readlane_b32 s19, v40, 15 @@ -9149,8 +9149,8 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx16 s[4:19], s[30:31], 0x0 ; GFX9-NEXT: s_load_dwordx16 s[36:51], s[30:31], 0x40 +; GFX9-NEXT: s_load_dwordx16 s[4:19], s[30:31], 0x0 ; GFX9-NEXT: s_getpc_b64 s[30:31] ; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v32i32_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v32i32_inreg@rel32@hi+12 @@ -9474,8 +9474,8 @@ ; GFX9-NEXT: s_load_dword s34, s[30:31], 0x0 ; GFX9-NEXT: ; kill: killed $sgpr30_sgpr31 ; GFX9-NEXT: ; kill: killed $sgpr30_sgpr31 -; GFX9-NEXT: s_load_dwordx16 s[4:19], s[30:31], 0x0 ; GFX9-NEXT: s_load_dwordx16 s[36:51], s[30:31], 0x40 +; GFX9-NEXT: s_load_dwordx16 s[4:19], s[30:31], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s34 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll --- a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -159,13 +159,12 @@ define amdgpu_kernel void @extload_v2f16_arg(<2 x float> addrspace(1)* %out, <2 x half> %in) #0 { ; SI-LABEL: extload_v2f16_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[4:5], 0x2 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s1, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s1 +; SI-NEXT: s_load_dword s2, s[4:5], 0x2 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_lshr_b32 s3, s2, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: v_mov_b32_e32 v3, s1 ; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -173,13 +172,12 @@ ; ; VI-LABEL: extload_v2f16_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[4:5], 0x8 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s1, s0, 16 -; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; VI-NEXT: v_cvt_f32_f16_e32 v1, s1 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s3, s2, 16 +; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 +; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -192,11 +190,10 @@ define amdgpu_kernel void @extload_f16_to_f32_arg(float addrspace(1)* %out, half %arg) #0 { ; SI-LABEL: extload_f16_to_f32_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[4:5], 0x2 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, s0 +; SI-NEXT: s_load_dword s2, s[4:5], 0x2 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, s2 ; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: flat_store_dword v[0:1], v2 @@ -204,11 +201,10 @@ ; ; VI-LABEL: extload_f16_to_f32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[4:5], 0x8 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v2, s0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -221,13 +217,12 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* %out, <2 x half> %arg) #0 { ; SI-LABEL: extload_v2f16_to_v2f32_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[4:5], 0x2 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s1, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s1 +; SI-NEXT: s_load_dword s2, s[4:5], 0x2 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_lshr_b32 s3, s2, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: v_mov_b32_e32 v3, s1 ; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -235,13 +230,12 @@ ; ; VI-LABEL: extload_v2f16_to_v2f32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[4:5], 0x8 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s1, s0, 16 -; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; VI-NEXT: v_cvt_f32_f16_e32 v1, s1 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s3, s2, 16 +; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 +; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -255,30 +249,28 @@ ; SI-LABEL: extload_v3f16_to_v3f32_arg: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s2, s0, 16 +; SI-NEXT: s_lshr_b32 s4, s0, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v4, s1 -; SI-NEXT: v_mov_b32_e32 v3, s0 +; SI-NEXT: v_mov_b32_e32 v4, s3 +; SI-NEXT: v_mov_b32_e32 v3, s2 ; SI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; SI-NEXT: s_endpgm ; ; VI-LABEL: extload_v3f16_to_v3f32_arg: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s2, s0, 16 +; VI-NEXT: s_lshr_b32 s4, s0, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, s4 ; VI-NEXT: v_cvt_f32_f16_e32 v2, s1 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_cvt_f32_f16_e32 v1, s2 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, s1 -; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v4, s3 +; VI-NEXT: v_mov_b32_e32 v3, s2 ; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-NEXT: s_endpgm %ext = fpext <3 x half> %arg to <3 x float> @@ -290,34 +282,32 @@ ; SI-LABEL: extload_v4f16_to_v4f32_arg: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s2, s1, 16 -; SI-NEXT: s_lshr_b32 s3, s0, 16 +; SI-NEXT: s_lshr_b32 s4, s1, 16 +; SI-NEXT: s_lshr_b32 s5, s0, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s3 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v5, s1 -; SI-NEXT: v_mov_b32_e32 v4, s0 +; SI-NEXT: v_mov_b32_e32 v5, s3 +; SI-NEXT: v_mov_b32_e32 v4, s2 ; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; SI-NEXT: s_endpgm ; ; VI-LABEL: extload_v4f16_to_v4f32_arg: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s2, s1, 16 -; VI-NEXT: s_lshr_b32 s3, s0, 16 +; VI-NEXT: s_lshr_b32 s4, s1, 16 +; VI-NEXT: s_lshr_b32 s5, s0, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; VI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; VI-NEXT: v_cvt_f32_f16_e32 v1, s5 ; VI-NEXT: v_cvt_f32_f16_e32 v2, s1 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_cvt_f32_f16_e32 v3, s2 -; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm %ext = fpext <4 x half> %arg to <4 x float> @@ -328,57 +318,57 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x half> %arg) #0 { ; SI-LABEL: extload_v8f16_to_v8f32_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s1, 16 -; SI-NEXT: s_lshr_b32 s5, s0, 16 +; SI-NEXT: s_lshr_b32 s6, s1, 16 +; SI-NEXT: s_lshr_b32 s7, s0, 16 ; SI-NEXT: s_lshr_b32 s8, s3, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s2, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 +; SI-NEXT: s_lshr_b32 s6, s2, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s2 -; SI-NEXT: s_add_u32 s0, s6, 16 +; SI-NEXT: s_add_u32 s0, s4, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s1 -; SI-NEXT: s_addc_u32 s1, s7, 0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; SI-NEXT: s_addc_u32 s1, s5, 0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 ; SI-NEXT: v_mov_b32_e32 v9, s1 ; SI-NEXT: v_mov_b32_e32 v8, s0 ; SI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; SI-NEXT: s_nop 0 -; SI-NEXT: v_mov_b32_e32 v4, s6 -; SI-NEXT: v_mov_b32_e32 v5, s7 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_mov_b32_e32 v5, s5 ; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; SI-NEXT: s_endpgm ; ; VI-LABEL: extload_v8f16_to_v8f32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s4, s1, 16 -; VI-NEXT: s_lshr_b32 s5, s0, 16 +; VI-NEXT: s_lshr_b32 s6, s1, 16 +; VI-NEXT: s_lshr_b32 s7, s0, 16 ; VI-NEXT: s_lshr_b32 s8, s3, 16 -; VI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; VI-NEXT: s_lshr_b32 s4, s2, 16 +; VI-NEXT: v_cvt_f32_f16_e32 v3, s6 +; VI-NEXT: s_lshr_b32 s6, s2, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v7, s8 -; VI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; VI-NEXT: v_cvt_f32_f16_e32 v5, s6 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; VI-NEXT: v_cvt_f32_f16_e32 v6, s3 ; VI-NEXT: v_cvt_f32_f16_e32 v4, s2 -; VI-NEXT: s_add_u32 s0, s6, 16 +; VI-NEXT: s_add_u32 s0, s4, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v2, s1 -; VI-NEXT: s_addc_u32 s1, s7, 0 -; VI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; VI-NEXT: s_addc_u32 s1, s5, 0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v9, s1 ; VI-NEXT: v_mov_b32_e32 v8, s0 ; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; VI-NEXT: s_nop 0 -; VI-NEXT: v_mov_b32_e32 v4, s6 -; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm %ext = fpext <8 x half> %arg to <8 x float> @@ -423,11 +413,11 @@ ; SI-NEXT: s_load_dword s0, s[4:5], 0x2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s1, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, s1 ; SI-NEXT: v_mov_b32_e32 v4, s0 @@ -439,11 +429,11 @@ ; VI-NEXT: s_load_dword s0, s[4:5], 0x8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s1, s0, 16 -; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; VI-NEXT: v_cvt_f32_f16_e32 v2, s1 +; VI-NEXT: v_cvt_f32_f16_e32 v0, s1 +; VI-NEXT: v_cvt_f32_f16_e32 v1, s0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 +; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 @@ -457,45 +447,45 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x half> %arg) #0 { ; SI-LABEL: extload_v3f16_to_v3f64_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s3 -; SI-NEXT: s_lshr_b32 s4, s2, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s1 +; SI-NEXT: s_lshr_b32 s4, s0, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_add_u32 s2, s0, 16 +; SI-NEXT: s_add_u32 s0, s2, 16 ; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v0 -; SI-NEXT: s_addc_u32 s3, s1, 0 +; SI-NEXT: s_addc_u32 s1, s3, 0 ; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 ; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; SI-NEXT: v_mov_b32_e32 v7, s3 -; SI-NEXT: v_mov_b32_e32 v6, s2 +; SI-NEXT: v_mov_b32_e32 v7, s1 +; SI-NEXT: v_mov_b32_e32 v6, s0 ; SI-NEXT: flat_store_dwordx2 v[6:7], v[4:5] -; SI-NEXT: v_mov_b32_e32 v5, s1 -; SI-NEXT: v_mov_b32_e32 v4, s0 +; SI-NEXT: v_mov_b32_e32 v5, s3 +; SI-NEXT: v_mov_b32_e32 v4, s2 ; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; SI-NEXT: s_endpgm ; ; VI-LABEL: extload_v3f16_to_v3f64_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 -; VI-NEXT: s_lshr_b32 s4, s2, 16 -; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; VI-NEXT: v_cvt_f32_f16_e32 v1, s1 +; VI-NEXT: s_lshr_b32 s4, s0, 16 +; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; VI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; VI-NEXT: s_add_u32 s2, s0, 16 +; VI-NEXT: s_add_u32 s0, s2, 16 ; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v1 -; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: s_addc_u32 s1, s3, 0 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; VI-NEXT: v_mov_b32_e32 v7, s3 -; VI-NEXT: v_mov_b32_e32 v6, s2 +; VI-NEXT: v_mov_b32_e32 v7, s1 +; VI-NEXT: v_mov_b32_e32 v6, s0 ; VI-NEXT: flat_store_dwordx2 v[6:7], v[4:5] -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm %ext = fpext <3 x half> %arg to <3 x double> @@ -506,53 +496,53 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 { ; SI-LABEL: extload_v4f16_to_v4f64_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s3, 16 +; SI-NEXT: s_lshr_b32 s4, s1, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s3 -; SI-NEXT: s_lshr_b32 s5, s2, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s1 +; SI-NEXT: s_lshr_b32 s5, s0, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 ; SI-NEXT: v_cvt_f64_f32_e32 v[6:7], v4 ; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 -; SI-NEXT: s_add_u32 s2, s0, 16 -; SI-NEXT: s_addc_u32 s3, s1, 0 +; SI-NEXT: s_add_u32 s0, s2, 16 +; SI-NEXT: s_addc_u32 s1, s3, 0 ; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; SI-NEXT: v_mov_b32_e32 v9, s3 -; SI-NEXT: v_mov_b32_e32 v8, s2 +; SI-NEXT: v_mov_b32_e32 v9, s1 +; SI-NEXT: v_mov_b32_e32 v8, s0 ; SI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; SI-NEXT: s_nop 0 -; SI-NEXT: v_mov_b32_e32 v5, s1 -; SI-NEXT: v_mov_b32_e32 v4, s0 +; SI-NEXT: v_mov_b32_e32 v5, s3 +; SI-NEXT: v_mov_b32_e32 v4, s2 ; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; SI-NEXT: s_endpgm ; ; VI-LABEL: extload_v4f16_to_v4f64_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s5, s3, 16 +; VI-NEXT: s_lshr_b32 s5, s1, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v4, s5 -; VI-NEXT: v_cvt_f32_f16_e32 v5, s3 -; VI-NEXT: s_lshr_b32 s4, s2, 16 -; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; VI-NEXT: v_cvt_f32_f16_e32 v5, s1 +; VI-NEXT: s_lshr_b32 s4, s0, 16 +; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; VI-NEXT: v_cvt_f32_f16_e32 v2, s4 ; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v4 ; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 -; VI-NEXT: s_add_u32 s2, s0, 16 -; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: s_add_u32 s0, s2, 16 +; VI-NEXT: s_addc_u32 s1, s3, 0 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; VI-NEXT: v_mov_b32_e32 v9, s3 -; VI-NEXT: v_mov_b32_e32 v8, s2 +; VI-NEXT: v_mov_b32_e32 v9, s1 +; VI-NEXT: v_mov_b32_e32 v8, s0 ; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; VI-NEXT: s_nop 0 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm %ext = fpext <4 x half> %arg to <4 x double> @@ -563,39 +553,39 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x half> %arg) #0 { ; SI-LABEL: extload_v8f16_to_v8f64_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s3, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_lshr_b32 s6, s3, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v12, s3 -; SI-NEXT: s_lshr_b32 s5, s2, 16 +; SI-NEXT: s_lshr_b32 s7, s2, 16 ; SI-NEXT: s_lshr_b32 s8, s1, 16 -; SI-NEXT: s_lshr_b32 s4, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; SI-NEXT: s_lshr_b32 s6, s0, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v8, s2 ; SI-NEXT: v_cvt_f32_f16_e32 v9, s0 -; SI-NEXT: s_add_u32 s0, s6, 48 +; SI-NEXT: s_add_u32 s0, s4, 48 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s1 ; SI-NEXT: v_cvt_f64_f32_e32 v[14:15], v0 ; SI-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 -; SI-NEXT: s_addc_u32 s1, s7, 0 +; SI-NEXT: s_addc_u32 s1, s5, 0 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v17, s1 ; SI-NEXT: v_mov_b32_e32 v16, s0 -; SI-NEXT: s_add_u32 s0, s6, 32 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_add_u32 s0, s4, 32 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 ; SI-NEXT: v_cvt_f64_f32_e32 v[10:11], v1 ; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9 ; SI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; SI-NEXT: s_addc_u32 s1, s7, 0 +; SI-NEXT: s_addc_u32 s1, s5, 0 ; SI-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; SI-NEXT: v_cvt_f64_f32_e32 v[6:7], v4 ; SI-NEXT: v_mov_b32_e32 v13, s1 ; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 ; SI-NEXT: v_mov_b32_e32 v12, s0 -; SI-NEXT: s_add_u32 s0, s6, 16 -; SI-NEXT: s_addc_u32 s1, s7, 0 +; SI-NEXT: s_add_u32 s0, s4, 16 +; SI-NEXT: s_addc_u32 s1, s5, 0 ; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 ; SI-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; SI-NEXT: s_nop 0 @@ -603,46 +593,46 @@ ; SI-NEXT: v_mov_b32_e32 v8, s0 ; SI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; SI-NEXT: s_nop 0 -; SI-NEXT: v_mov_b32_e32 v4, s6 -; SI-NEXT: v_mov_b32_e32 v5, s7 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_mov_b32_e32 v5, s5 ; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; SI-NEXT: s_endpgm ; ; VI-LABEL: extload_v8f16_to_v8f64_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s4, s0, 16 +; VI-NEXT: s_lshr_b32 s6, s0, 16 ; VI-NEXT: s_lshr_b32 s8, s2, 16 ; VI-NEXT: s_lshr_b32 s9, s3, 16 -; VI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; VI-NEXT: v_cvt_f32_f16_e32 v0, s6 ; VI-NEXT: v_cvt_f32_f16_e32 v4, s8 ; VI-NEXT: v_cvt_f32_f16_e32 v5, s9 ; VI-NEXT: v_cvt_f32_f16_e32 v12, s3 -; VI-NEXT: s_lshr_b32 s5, s1, 16 +; VI-NEXT: s_lshr_b32 s7, s1, 16 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; VI-NEXT: v_cvt_f32_f16_e32 v8, s2 -; VI-NEXT: s_add_u32 s0, s6, 48 +; VI-NEXT: s_add_u32 s0, s4, 48 ; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v4 ; VI-NEXT: v_cvt_f64_f32_e32 v[14:15], v5 ; VI-NEXT: v_cvt_f32_f16_e32 v4, s1 ; VI-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 -; VI-NEXT: s_addc_u32 s1, s7, 0 -; VI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; VI-NEXT: s_addc_u32 s1, s5, 0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v17, s1 ; VI-NEXT: v_mov_b32_e32 v16, s0 -; VI-NEXT: s_add_u32 s0, s6, 32 +; VI-NEXT: s_add_u32 s0, s4, 32 ; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; VI-NEXT: s_addc_u32 s1, s7, 0 +; VI-NEXT: s_addc_u32 s1, s5, 0 ; VI-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v1 ; VI-NEXT: v_mov_b32_e32 v13, s1 ; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 ; VI-NEXT: v_mov_b32_e32 v12, s0 -; VI-NEXT: s_add_u32 s0, s6, 16 -; VI-NEXT: s_addc_u32 s1, s7, 0 +; VI-NEXT: s_add_u32 s0, s4, 16 +; VI-NEXT: s_addc_u32 s1, s5, 0 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; VI-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; VI-NEXT: s_nop 0 @@ -650,8 +640,8 @@ ; VI-NEXT: v_mov_b32_e32 v8, s0 ; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; VI-NEXT: s_nop 0 -; VI-NEXT: v_mov_b32_e32 v4, s6 -; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm %ext = fpext <8 x half> %arg to <8 x double> @@ -664,9 +654,9 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NEXT: v_mov_b32_e32 v3, s3 -; GCN-NEXT: flat_load_ushort v2, v[2:3] +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: flat_load_ushort v2, v[0:1] ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -682,9 +672,9 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NEXT: v_mov_b32_e32 v3, s3 -; GCN-NEXT: flat_load_dword v2, v[2:3] +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: flat_load_dword v2, v[0:1] ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -960,13 +950,12 @@ ; SI-NEXT: v_cvt_f32_f16_e32 v12, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v6 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: flat_store_dwordx4 v[13:14], v[9:12] -; SI-NEXT: s_nop 0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v4 +; SI-NEXT: flat_store_dwordx4 v[13:14], v[9:12] ; SI-NEXT: v_cvt_f32_f16_e32 v6, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v2 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 @@ -997,11 +986,11 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: s_add_u32 s2, s2, 16 ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: s_addc_u32 s3, s1, 0 @@ -1126,9 +1115,9 @@ ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v1 -; SI-NEXT: flat_store_dwordx2 v[6:7], v[4:5] ; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; SI-NEXT: flat_store_dwordx2 v[6:7], v[4:5] ; SI-NEXT: v_mov_b32_e32 v5, s1 ; SI-NEXT: v_mov_b32_e32 v4, s0 ; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -1258,13 +1247,12 @@ ; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v11 +; SI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 ; SI-NEXT: flat_store_dwordx4 v[6:7], v[0:3] -; SI-NEXT: s_nop 0 +; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 ; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v10 -; SI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 ; SI-NEXT: v_cvt_f64_f32_e32 v[10:11], v16 ; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v17 -; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 ; SI-NEXT: v_cvt_f64_f32_e32 v[6:7], v18 ; SI-NEXT: v_mov_b32_e32 v17, s1 ; SI-NEXT: v_mov_b32_e32 v16, s0 @@ -1329,94 +1317,91 @@ ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; SI-NEXT: s_add_u32 s2, s2, 16 ; SI-NEXT: s_addc_u32 s3, s3, 0 -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: v_mov_b32_e32 v3, s3 -; SI-NEXT: flat_load_dwordx4 v[4:7], v[0:1] -; SI-NEXT: flat_load_dwordx4 v[0:3], v[2:3] +; SI-NEXT: v_mov_b32_e32 v5, s3 +; SI-NEXT: v_mov_b32_e32 v4, s2 +; SI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; SI-NEXT: s_add_u32 s2, s0, 48 ; SI-NEXT: s_addc_u32 s3, s1, 0 -; SI-NEXT: v_mov_b32_e32 v14, s3 -; SI-NEXT: v_mov_b32_e32 v13, s2 +; SI-NEXT: v_mov_b32_e32 v15, s3 +; SI-NEXT: v_mov_b32_e32 v14, s2 ; SI-NEXT: s_add_u32 s2, s0, 32 ; SI-NEXT: s_addc_u32 s3, s1, 0 -; SI-NEXT: v_mov_b32_e32 v16, s3 -; SI-NEXT: v_mov_b32_e32 v15, s2 +; SI-NEXT: v_mov_b32_e32 v17, s3 +; SI-NEXT: v_mov_b32_e32 v16, s2 ; SI-NEXT: s_add_u32 s2, s0, 16 ; SI-NEXT: s_addc_u32 s3, s1, 0 -; SI-NEXT: v_mov_b32_e32 v18, s3 -; SI-NEXT: v_mov_b32_e32 v17, s2 +; SI-NEXT: v_mov_b32_e32 v19, s3 +; SI-NEXT: v_mov_b32_e32 v18, s2 ; SI-NEXT: s_add_u32 s2, s0, 0x70 ; SI-NEXT: s_addc_u32 s3, s1, 0 -; SI-NEXT: v_mov_b32_e32 v12, s1 -; SI-NEXT: v_mov_b32_e32 v11, s0 +; SI-NEXT: v_mov_b32_e32 v13, s1 +; SI-NEXT: v_mov_b32_e32 v12, s0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v3 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 -; SI-NEXT: v_cvt_f64_f32_e32 v[7:8], v7 -; SI-NEXT: v_cvt_f64_f32_e32 v[9:10], v9 -; SI-NEXT: flat_store_dwordx4 v[13:14], v[7:10] -; SI-NEXT: s_nop 0 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v7 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 -; SI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 -; SI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; SI-NEXT: flat_store_dwordx4 v[15:16], v[6:9] -; SI-NEXT: s_nop 0 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 -; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 -; SI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v5 +; SI-NEXT: v_cvt_f64_f32_e32 v[8:9], v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v5 +; SI-NEXT: flat_store_dwordx4 v[14:15], v[8:11] +; SI-NEXT: v_mov_b32_e32 v15, s3 +; SI-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 +; SI-NEXT: v_cvt_f64_f32_e32 v[10:11], v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] +; SI-NEXT: v_mov_b32_e32 v14, s2 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: flat_store_dwordx4 v[17:18], v[4:7] -; SI-NEXT: s_nop 0 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: flat_store_dwordx4 v[18:19], v[0:3] +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 ; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9 ; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v10 -; SI-NEXT: v_mov_b32_e32 v14, s3 -; SI-NEXT: v_mov_b32_e32 v13, s2 ; SI-NEXT: s_add_u32 s2, s0, 0x60 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 ; SI-NEXT: s_addc_u32 s3, s1, 0 -; SI-NEXT: flat_store_dwordx4 v[11:12], v[0:3] -; SI-NEXT: v_cvt_f32_f16_e32 v12, v5 -; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v19 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: flat_store_dwordx4 v[12:13], v[0:3] +; SI-NEXT: v_mov_b32_e32 v17, s3 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v7 ; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 -; SI-NEXT: v_mov_b32_e32 v16, s3 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v20 -; SI-NEXT: v_mov_b32_e32 v15, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v5 +; SI-NEXT: v_mov_b32_e32 v16, s2 ; SI-NEXT: s_add_u32 s2, s0, 0x50 ; SI-NEXT: s_addc_u32 s3, s1, 0 ; SI-NEXT: v_cvt_f64_f32_e32 v[8:9], v6 ; SI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 ; SI-NEXT: s_add_u32 s0, s0, 64 -; SI-NEXT: flat_store_dwordx4 v[13:14], v[0:3] +; SI-NEXT: flat_store_dwordx4 v[14:15], v[0:3] ; SI-NEXT: s_addc_u32 s1, s1, 0 -; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v7 -; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v12 -; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v21 -; SI-NEXT: v_cvt_f64_f32_e32 v[6:7], v19 -; SI-NEXT: v_mov_b32_e32 v18, s3 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v21 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v7 +; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-NEXT: v_cvt_f64_f32_e32 v[6:7], v12 +; SI-NEXT: v_mov_b32_e32 v19, s3 ; SI-NEXT: v_mov_b32_e32 v13, s1 -; SI-NEXT: v_mov_b32_e32 v17, s2 +; SI-NEXT: v_mov_b32_e32 v18, s2 ; SI-NEXT: v_mov_b32_e32 v12, s0 -; SI-NEXT: flat_store_dwordx4 v[15:16], v[8:11] -; SI-NEXT: flat_store_dwordx4 v[17:18], v[0:3] +; SI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] +; SI-NEXT: flat_store_dwordx4 v[18:19], v[0:3] ; SI-NEXT: flat_store_dwordx4 v[12:13], v[4:7] ; SI-NEXT: s_endpgm ; @@ -1426,12 +1411,12 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; VI-NEXT: s_add_u32 s2, s2, 16 ; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: flat_load_dwordx4 v[4:7], v[0:1] -; VI-NEXT: flat_load_dwordx4 v[0:3], v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: s_add_u32 s2, s0, 48 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v14, s3 @@ -1462,39 +1447,37 @@ ; VI-NEXT: v_mov_b32_e32 v14, s3 ; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 ; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; VI-NEXT: v_mov_b32_e32 v13, s2 +; VI-NEXT: s_add_u32 s2, s0, 0x60 +; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dwordx4 v[15:16], v[6:9] -; VI-NEXT: s_nop 0 +; VI-NEXT: v_mov_b32_e32 v16, s3 ; VI-NEXT: v_cvt_f32_f16_e32 v6, v5 ; VI-NEXT: v_cvt_f32_f16_sdwa v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f32_f16_e32 v8, v4 ; VI-NEXT: v_cvt_f32_f16_sdwa v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 ; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 +; VI-NEXT: v_mov_b32_e32 v15, s2 +; VI-NEXT: s_add_u32 s2, s0, 0x50 +; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dwordx4 v[17:18], v[4:7] -; VI-NEXT: s_nop 0 +; VI-NEXT: v_cvt_f32_f16_sdwa v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v8 ; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v9 ; VI-NEXT: v_cvt_f32_f16_sdwa v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_sdwa v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; VI-NEXT: v_mov_b32_e32 v13, s2 -; VI-NEXT: s_add_u32 s2, s0, 0x60 +; VI-NEXT: s_add_u32 s0, s0, 64 ; VI-NEXT: flat_store_dwordx4 v[11:12], v[4:7] -; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_cvt_f32_f16_sdwa v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f64_f32_e32 v[5:6], v8 ; VI-NEXT: v_cvt_f32_f16_e32 v8, v2 ; VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f32_f16_e32 v7, v1 -; VI-NEXT: v_cvt_f32_f16_sdwa v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f64_f32_e32 v[3:4], v0 -; VI-NEXT: v_mov_b32_e32 v16, s3 -; VI-NEXT: v_mov_b32_e32 v15, s2 -; VI-NEXT: s_add_u32 s2, s0, 0x50 -; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v10 ; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 ; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v2 -; VI-NEXT: s_add_u32 s0, s0, 64 ; VI-NEXT: flat_store_dwordx4 v[13:14], v[3:6] ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v7 @@ -1675,34 +1658,34 @@ ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_add_u32 s4, s2, 16 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: s_add_u32 s2, s2, 16 +; SI-NEXT: s_addc_u32 s3, s3, 0 ; SI-NEXT: v_mov_b32_e32 v5, s3 -; SI-NEXT: s_addc_u32 s5, s3, 0 -; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v4, s2 -; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; SI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; SI-NEXT: v_mov_b32_e32 v8, s0 ; SI-NEXT: v_mov_b32_e32 v9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v6, v0 -; SI-NEXT: v_or_b32_e32 v0, v4, v5 -; SI-NEXT: v_or_b32_e32 v3, v2, v3 -; SI-NEXT: v_or_b32_e32 v2, v10, v7 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v10 +; SI-NEXT: v_or_b32_e32 v3, v6, v7 +; SI-NEXT: v_or_b32_e32 v2, v4, v5 ; SI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; SI-NEXT: s_endpgm ; @@ -1710,30 +1693,30 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s4, s2, 16 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_add_u32 s2, s2, 16 +; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v5, s3 -; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; VI-NEXT: v_mov_b32_e32 v8, s0 ; VI-NEXT: v_mov_b32_e32 v9, s1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; VI-NEXT: v_cvt_f16_f32_sdwa v10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f16_f32_sdwa v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; VI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; VI-NEXT: v_cvt_f16_f32_sdwa v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; VI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; VI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; VI-NEXT: v_cvt_f16_f32_sdwa v10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-NEXT: v_cvt_f16_f32_e32 v11, v0 -; VI-NEXT: v_or_b32_e32 v1, v6, v7 -; VI-NEXT: v_or_b32_e32 v0, v4, v5 -; VI-NEXT: v_or_b32_e32 v3, v2, v3 -; VI-NEXT: v_or_b32_e32 v2, v11, v10 +; VI-NEXT: v_or_b32_e32 v1, v2, v3 +; VI-NEXT: v_or_b32_e32 v0, v0, v10 +; VI-NEXT: v_or_b32_e32 v3, v6, v7 +; VI-NEXT: v_or_b32_e32 v2, v4, v5 ; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; VI-NEXT: s_endpgm %val = load <8 x float>, <8 x float> addrspace(1)* %in @@ -1752,17 +1735,17 @@ ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: s_add_u32 s4, s2, 48 -; SI-NEXT: v_mov_b32_e32 v13, s3 ; SI-NEXT: s_addc_u32 s5, s3, 0 -; SI-NEXT: v_mov_b32_e32 v12, s2 +; SI-NEXT: v_mov_b32_e32 v9, s3 ; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_mov_b32_e32 v8, s2 ; SI-NEXT: s_add_u32 s2, s2, 16 -; SI-NEXT: v_mov_b32_e32 v5, s5 -; SI-NEXT: s_addc_u32 s3, s3, 0 ; SI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; SI-NEXT: v_mov_b32_e32 v5, s5 ; SI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; SI-NEXT: v_mov_b32_e32 v9, s3 -; SI-NEXT: v_mov_b32_e32 v8, s2 +; SI-NEXT: s_addc_u32 s3, s3, 0 +; SI-NEXT: v_mov_b32_e32 v13, s3 +; SI-NEXT: v_mov_b32_e32 v12, s2 ; SI-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; SI-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; SI-NEXT: s_add_u32 s2, s0, 16 @@ -1777,15 +1760,16 @@ ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_mov_b32_e32 v5, s3 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 @@ -1796,16 +1780,16 @@ ; SI-NEXT: v_or_b32_e32 v0, v0, v18 ; SI-NEXT: v_or_b32_e32 v3, v6, v2 ; SI-NEXT: v_or_b32_e32 v2, v17, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 ; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; SI-NEXT: v_mov_b32_e32 v5, s1 -; SI-NEXT: v_or_b32_e32 v1, v14, v6 -; SI-NEXT: v_or_b32_e32 v0, v12, v7 -; SI-NEXT: v_or_b32_e32 v3, v10, v11 -; SI-NEXT: v_or_b32_e32 v2, v8, v9 +; SI-NEXT: v_or_b32_e32 v1, v10, v6 +; SI-NEXT: v_or_b32_e32 v0, v8, v7 +; SI-NEXT: v_or_b32_e32 v3, v14, v9 +; SI-NEXT: v_or_b32_e32 v2, v12, v11 ; SI-NEXT: v_mov_b32_e32 v4, s0 ; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; SI-NEXT: s_endpgm @@ -1819,17 +1803,17 @@ ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_add_u32 s4, s2, 48 -; VI-NEXT: v_mov_b32_e32 v13, s3 ; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v12, s2 +; VI-NEXT: v_mov_b32_e32 v9, s3 ; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v8, s2 ; VI-NEXT: s_add_u32 s2, s2, 16 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; VI-NEXT: v_mov_b32_e32 v9, s3 -; VI-NEXT: v_mov_b32_e32 v8, s2 +; VI-NEXT: v_mov_b32_e32 v13, s3 +; VI-NEXT: v_mov_b32_e32 v12, s2 ; VI-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; VI-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; VI-NEXT: s_add_u32 s2, s0, 16 @@ -1844,15 +1828,16 @@ ; VI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; VI-NEXT: v_cvt_f16_f32_sdwa v17, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; VI-NEXT: v_cvt_f16_f32_e32 v18, v4 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cvt_f16_f32_sdwa v11, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; VI-NEXT: v_cvt_f16_f32_sdwa v9, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f16_f32_sdwa v15, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; VI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; VI-NEXT: v_cvt_f16_f32_sdwa v13, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; VI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; VI-NEXT: v_cvt_f16_f32_sdwa v11, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; VI-NEXT: v_cvt_f16_f32_sdwa v9, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: v_or_b32_e32 v1, v2, v3 @@ -1861,10 +1846,10 @@ ; VI-NEXT: v_or_b32_e32 v2, v18, v17 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_or_b32_e32 v1, v14, v15 -; VI-NEXT: v_or_b32_e32 v0, v12, v13 -; VI-NEXT: v_or_b32_e32 v3, v10, v11 -; VI-NEXT: v_or_b32_e32 v2, v8, v9 +; VI-NEXT: v_or_b32_e32 v1, v10, v11 +; VI-NEXT: v_or_b32_e32 v0, v8, v9 +; VI-NEXT: v_or_b32_e32 v3, v14, v15 +; VI-NEXT: v_or_b32_e32 v2, v12, v13 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -1894,8 +1879,8 @@ ; ; VI-LABEL: fadd_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s3 @@ -1936,17 +1921,17 @@ ; ; VI-LABEL: fadd_v2f16: ; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[4:5], 0xc +; VI-NEXT: s_load_dword s3, s[4:5], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dword s3, s[4:5], 0xc ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s5, s2, 16 -; VI-NEXT: s_lshr_b32 s4, s3, 16 +; VI-NEXT: s_lshr_b32 s4, s2, 16 +; VI-NEXT: s_lshr_b32 s5, s3, 16 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_f16_e32 v1, s2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_add_f16_e32 v1, s3, v1 ; VI-NEXT: v_or_b32_e32 v2, v1, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1962,12 +1947,12 @@ ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_add_u32 s4, s2, 8 ; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: s_addc_u32 s5, s3, 0 -; SI-NEXT: v_mov_b32_e32 v2, s4 ; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: v_mov_b32_e32 v3, s5 +; SI-NEXT: s_add_u32 s2, s2, 8 +; SI-NEXT: s_addc_u32 s3, s3, 0 +; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: v_mov_b32_e32 v3, s3 ; SI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; SI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] ; SI-NEXT: v_mov_b32_e32 v4, s0 @@ -2035,8 +2020,8 @@ define amdgpu_kernel void @fadd_v8f16(<8 x half> addrspace(1)* %out, <8 x half> %a, <8 x half> %b) #0 { ; SI-LABEL: fadd_v8f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x8 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s10, s0, 16 @@ -2094,44 +2079,44 @@ ; ; VI-LABEL: fadd_v8f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x20 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s11, s3, 16 -; VI-NEXT: s_lshr_b32 s10, s7, 16 -; VI-NEXT: v_mov_b32_e32 v0, s10 -; VI-NEXT: v_mov_b32_e32 v1, s11 -; VI-NEXT: v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s6, s3, 16 +; VI-NEXT: s_lshr_b32 s7, s11, 16 +; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_f16_e32 v1, s3, v1 -; VI-NEXT: s_lshr_b32 s3, s6, 16 -; VI-NEXT: s_lshr_b32 s7, s2, 16 +; VI-NEXT: v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_f16_e32 v1, s11, v1 +; VI-NEXT: s_lshr_b32 s3, s2, 16 +; VI-NEXT: s_lshr_b32 s6, s10, 16 ; VI-NEXT: v_or_b32_e32 v3, v1, v0 ; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: v_add_f16_e32 v1, s2, v1 -; VI-NEXT: s_lshr_b32 s2, s5, 16 -; VI-NEXT: s_lshr_b32 s3, s1, 16 +; VI-NEXT: v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_add_f16_e32 v1, s10, v1 +; VI-NEXT: s_lshr_b32 s2, s1, 16 +; VI-NEXT: s_lshr_b32 s3, s9, 16 ; VI-NEXT: v_or_b32_e32 v2, v1, v0 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_f16_e32 v1, s1, v1 -; VI-NEXT: s_lshr_b32 s1, s4, 16 -; VI-NEXT: s_lshr_b32 s2, s0, 16 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_f16_e32 v1, s9, v1 +; VI-NEXT: s_lshr_b32 s1, s0, 16 +; VI-NEXT: s_lshr_b32 s2, s8, 16 ; VI-NEXT: v_or_b32_e32 v1, v1, v0 ; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: v_add_f16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_add_f16_e32 v4, s0, v4 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_add_f16_e32 v4, s8, v4 ; VI-NEXT: v_or_b32_e32 v0, v4, v0 -; VI-NEXT: v_mov_b32_e32 v4, s8 -; VI-NEXT: v_mov_b32_e32 v5, s9 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm %add = fadd <8 x half> %a, %b @@ -2146,11 +2131,11 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: flat_load_ushort v0, v[0:1] -; GCN-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: flat_load_ushort v2, v[0:1] +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: flat_store_short v[2:3], v0 +; GCN-NEXT: flat_store_short v[0:1], v2 ; GCN-NEXT: s_endpgm %val = load half, half addrspace(1)* %in %val_int = bitcast half %val to i16 @@ -2163,9 +2148,9 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NEXT: v_mov_b32_e32 v3, s3 -; GCN-NEXT: flat_load_ushort v2, v[2:3] +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: flat_load_ushort v2, v[0:1] ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll --- a/llvm/test/CodeGen/AMDGPU/idot2.ll +++ b/llvm/test/CodeGen/AMDGPU/idot2.ll @@ -24,6 +24,7 @@ ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s4, 0xffff ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) @@ -32,9 +33,8 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s4 +; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s5 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -46,7 +46,6 @@ ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -55,6 +54,7 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v1, s2, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -92,14 +92,14 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] +; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot2_u32_u16 v0, v3, v2, s0 -; GFX9-DL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2: @@ -185,7 +185,6 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -194,6 +193,7 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -323,7 +323,6 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -332,6 +331,7 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3 @@ -369,14 +369,14 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] +; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot2_i32_i16 v0, v3, v2, s0 -; GFX9-DL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-DL-NEXT: v_dot2_i32_i16 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2: @@ -457,7 +457,6 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -466,6 +465,7 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -577,6 +577,7 @@ ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s4, 0xffff ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) @@ -585,9 +586,8 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s4 +; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s5 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -599,7 +599,6 @@ ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -608,6 +607,7 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v1, s2, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -631,16 +631,16 @@ ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NODL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_and_b32_e32 v3, s0, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_and_b32_e32 v4, s0, v2 -; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s1 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v4, v3, v1 ; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm @@ -650,14 +650,14 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] +; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot2_u32_u16 v0, v3, v2, s0 -; GFX9-DL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_alt_AddOperands: @@ -738,7 +738,6 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -747,6 +746,7 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3 @@ -879,19 +879,19 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s6, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GFX8-NEXT: flat_load_dword v2, v[2:3] +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v1, v1, v1, s2 -; GFX8-NEXT: v_mad_u32_u24 v2, v0, v0, v1 +; GFX8-NEXT: v_mad_u32_u24 v0, v0, v0, s2 +; GFX8-NEXT: v_mad_u32_u24 v2, v1, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -998,17 +998,17 @@ ; GFX7-NEXT: s_mov_b64 s[6:7], s[10:11] ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s4, 0xffff ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_and_b32_e32 v1, s4, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v3, s4, v0 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, s4 +; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, s5 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v1, v0 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -1066,14 +1066,14 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] +; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot2_u32_u16 v0, v3, v2, s0 -; GFX9-DL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_v4i16: @@ -1134,6 +1134,7 @@ ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 offset:4 +; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s4, 0xffff ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) @@ -1141,10 +1142,9 @@ ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v3, s4, v0 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, s4 +; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, s5 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v1, v0 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -1156,7 +1156,6 @@ ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -1169,6 +1168,7 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v1, s2, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -1206,14 +1206,14 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] offset:4 -; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] offset:4 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] offset:4 +; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] offset:4 ; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot2_u32_u16 v0, v3, v2, s0 -; GFX9-DL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_v4i16_Hi: @@ -1275,17 +1275,17 @@ ; GFX7-NEXT: s_mov_b64 s[6:7], s[10:11] ; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s4, 0xffff ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, s4 +; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, s5 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -1418,17 +1418,17 @@ ; GFX7-NEXT: s_mov_b64 s[6:7], s[10:11] ; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s4, 0xffff ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, s4 +; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, s5 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -1560,6 +1560,7 @@ ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s4, 0xffff ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) @@ -1568,9 +1569,8 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, v0, v1, s4 +; GFX7-NEXT: v_mad_u32_u24 v0, v0, v1, s5 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v2, v0 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -1582,7 +1582,6 @@ ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -1591,6 +1590,7 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v1, s2, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -1725,7 +1725,6 @@ ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -1734,6 +1733,7 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v1, s2, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -1876,7 +1876,6 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -1885,6 +1884,7 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3 @@ -2006,6 +2006,7 @@ ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s4, 0xffff ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) @@ -2014,9 +2015,8 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v4, v0, v2, s4 +; GFX7-NEXT: v_mad_u32_u24 v4, v0, v2, s5 ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, v4 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -2029,7 +2029,6 @@ ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -2038,6 +2037,7 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v1, s2, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -2189,7 +2189,6 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -2198,6 +2197,7 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3 @@ -2325,6 +2325,7 @@ ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s4, 0xffff ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) @@ -2332,10 +2333,9 @@ ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v4, v3, v1, s4 +; GFX7-NEXT: v_mad_u32_u24 v4, v3, v1, s5 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, v4 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -2348,7 +2348,6 @@ ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -2357,6 +2356,7 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v1, s2, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -2503,7 +2503,6 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -2512,6 +2511,7 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3 @@ -2636,18 +2636,18 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: buffer_load_ushort v4, off, s[0:3], 0 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 ; GFX7-NEXT: s_mov_b32 s4, 0xffff ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, v4 +; GFX7-NEXT: v_mad_u32_u24 v1, v3, v4, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -2661,22 +2661,22 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: flat_load_ushort v6, v[2:3] +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ushort v4, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u16 v1, v1, v5, v6 -; GFX8-NEXT: v_mad_u16 v0, v4, v0, v1 -; GFX8-NEXT: flat_store_short v[2:3], v0 +; GFX8-NEXT: v_mad_u16 v4, v5, v6, v4 +; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 +; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot2_acc16: @@ -2688,13 +2688,13 @@ ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-NODL-NEXT: global_load_ushort v5, v1, s[2:3] +; GFX9-NODL-NEXT: global_load_ushort v4, v1, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v0, v4, v5 +; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v0, v5, v4 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v2, v3, v0 ; GFX9-NODL-NEXT: global_store_short v1, v0, s[2:3] ; GFX9-NODL-NEXT: s_endpgm @@ -2708,13 +2708,13 @@ ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-DL-NEXT: global_load_ushort v5, v1, s[2:3] +; GFX9-DL-NEXT: global_load_ushort v4, v1, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v0, v4, v5 +; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v0, v5, v4 ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v2, v3, v0 ; GFX9-DL-NEXT: global_store_short v1, v0, s[2:3] ; GFX9-DL-NEXT: s_endpgm @@ -2799,7 +2799,6 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -2808,6 +2807,7 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8 ; GFX8-NEXT: v_lshrrev_b16_e32 v3, 8, v3 diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll --- a/llvm/test/CodeGen/AMDGPU/idot4s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll @@ -47,7 +47,6 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -56,6 +55,7 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8 ; GFX8-NEXT: v_bfe_i32 v4, v3, 8, 8 @@ -102,14 +102,14 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] +; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_i32_i8 v0, v2, v3, s0 -; GFX9-DL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v1, v2, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32: @@ -187,30 +187,30 @@ ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: buffer_load_ushort v8, off, s[0:3], 0 +; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 ; GFX7-NEXT: s_mov_b32 s4, 0xffff ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8 -; GFX7-NEXT: v_bfe_i32 v3, v2, 8, 8 +; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 8 +; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_bfe_i32 v5, v0, 0, 8 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX7-NEXT: v_bfe_i32 v6, v0, 8, 8 -; GFX7-NEXT: v_and_b32_e32 v5, s4, v5 -; GFX7-NEXT: v_bfe_i32 v4, v2, 16, 8 +; GFX7-NEXT: v_bfe_i32 v6, v0, 0, 8 ; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 -; GFX7-NEXT: v_bfe_i32 v7, v0, 16, 8 +; GFX7-NEXT: v_bfe_i32 v7, v0, 8, 8 ; GFX7-NEXT: v_and_b32_e32 v6, s4, v6 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, v8 -; GFX7-NEXT: v_ashrrev_i32_e32 v2, 24, v2 +; GFX7-NEXT: v_bfe_i32 v5, v2, 16, 8 ; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 -; GFX7-NEXT: v_ashrrev_i32_e32 v0, 24, v0 +; GFX7-NEXT: v_bfe_i32 v8, v0, 16, 8 ; GFX7-NEXT: v_and_b32_e32 v7, s4, v7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1 +; GFX7-NEXT: v_ashrrev_i32_e32 v2, 24, v2 +; GFX7-NEXT: v_and_b32_e32 v5, s4, v5 +; GFX7-NEXT: v_ashrrev_i32_e32 v0, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v8, s4, v8 +; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 +; GFX7-NEXT: v_mad_u32_u24 v1, v5, v8, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -224,36 +224,36 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: flat_load_ushort v1, v[2:3] +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ushort v4, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_bfe_i32 v7, v4, 0, 8 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_bfe_i32 v7, v3, 0, 8 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX8-NEXT: v_bfe_i32 v9, v9, 0, 8 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3 ; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 8 -; GFX8-NEXT: v_bfe_i32 v4, v4, 0, 8 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_bfe_i32 v8, v0, 0, 8 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX8-NEXT: v_bfe_i32 v8, v2, 0, 8 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX8-NEXT: v_bfe_i32 v10, v10, 0, 8 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u16 v1, v7, v8, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX8-NEXT: v_mad_u16 v4, v7, v8, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX8-NEXT: v_bfe_i32 v6, v6, 0, 8 -; GFX8-NEXT: v_mad_u16 v1, v9, v10, v1 -; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX8-NEXT: v_mad_u16 v1, v5, v6, v1 -; GFX8-NEXT: v_mad_u16 v0, v4, v0, v1 -; GFX8-NEXT: flat_store_short v[2:3], v0 +; GFX8-NEXT: v_mad_u16 v4, v9, v10, v4 +; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8 +; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GFX8-NEXT: v_mad_u16 v4, v5, v6, v4 +; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 +; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: idot4_acc16: @@ -261,33 +261,33 @@ ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-NODL-NEXT: global_load_ushort v4, v1, s[2:3] +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) -; GFX9-NODL-NEXT: v_bfe_i32 v6, v2, 0, 8 +; GFX9-NODL-NEXT: v_bfe_i32 v6, v1, 0, 8 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) -; GFX9-NODL-NEXT: v_bfe_i32 v7, v3, 0, 8 -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v8, 8, v2 -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v9, 8, v3 -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX9-NODL-NEXT: v_bfe_i32 v7, v2, 0, 8 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX9-NODL-NEXT: v_bfe_i32 v8, v8, 0, 8 ; GFX9-NODL-NEXT: v_bfe_i32 v9, v9, 0, 8 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_legacy_u16 v4, v6, v7, v4 +; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v6, v7, v3 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v3, 24, v3 -; GFX9-NODL-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX9-NODL-NEXT: v_bfe_i32 v4, v4, 0, 8 ; GFX9-NODL-NEXT: v_bfe_i32 v5, v5, 0, 8 -; GFX9-NODL-NEXT: v_mad_legacy_u16 v4, v8, v9, v4 +; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v8, v9, v3 +; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX9-NODL-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX9-NODL-NEXT: v_bfe_i32 v3, v3, 0, 8 -; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v0, v5, v4 -; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v2, v3, v0 -; GFX9-NODL-NEXT: global_store_short v1, v0, s[2:3] +; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v4, v5, v3 +; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 +; GFX9-NODL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc16: @@ -295,33 +295,33 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-DL-NEXT: global_load_ushort v4, v1, s[2:3] +; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_bfe_i32 v6, v2, 0, 8 +; GFX9-DL-NEXT: v_bfe_i32 v6, v1, 0, 8 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_bfe_i32 v7, v3, 0, 8 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v2 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v3 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX9-DL-NEXT: v_bfe_i32 v7, v2, 0, 8 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX9-DL-NEXT: v_bfe_i32 v8, v8, 0, 8 ; GFX9-DL-NEXT: v_bfe_i32 v9, v9, 0, 8 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_legacy_u16 v4, v6, v7, v4 +; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v6, v7, v3 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v3, 24, v3 -; GFX9-DL-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX9-DL-NEXT: v_bfe_i32 v4, v4, 0, 8 ; GFX9-DL-NEXT: v_bfe_i32 v5, v5, 0, 8 -; GFX9-DL-NEXT: v_mad_legacy_u16 v4, v8, v9, v4 +; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v8, v9, v3 +; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX9-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX9-DL-NEXT: v_bfe_i32 v3, v3, 0, 8 -; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v0, v5, v4 -; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v2, v3, v0 -; GFX9-DL-NEXT: global_store_short v1, v0, s[2:3] +; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v4, v5, v3 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 +; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc16: @@ -415,24 +415,24 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: buffer_load_ubyte v8, off, s[0:3], 0 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 ; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8 -; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v2 +; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v5, s4, v0 -; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 +; GFX7-NEXT: v_and_b32_e32 v6, s4, v0 +; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, v8 -; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8 ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1 +; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8 +; GFX7-NEXT: v_bfe_u32 v8, v0, 16, 8 +; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 -; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 +; GFX7-NEXT: v_mad_u32_u24 v1, v5, v8, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -446,27 +446,28 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_ubyte v5, v[2:3] -; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ubyte v4, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v3 +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v0 -; GFX8-NEXT: v_mad_u16 v0, v4, v0, v5 -; GFX8-NEXT: v_mad_u16 v0, v7, v8, v0 -; GFX8-NEXT: v_mad_u16 v0, v1, v6, v0 -; GFX8-NEXT: v_mad_u16 v0, v9, v10, v0 -; GFX8-NEXT: flat_store_byte v[2:3], v0 +; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 +; GFX8-NEXT: v_mad_u16 v2, v7, v8, v2 +; GFX8-NEXT: v_mad_u16 v2, v5, v6, v2 +; GFX8-NEXT: v_mad_u16 v2, v9, v10, v2 +; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: idot4_acc8: @@ -474,25 +475,25 @@ ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-NODL-NEXT: global_load_ubyte v4, v1, s[2:3] +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v2 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v3 -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v8, 24, v2 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v2 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v8, 24, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_legacy_u16 v2, v2, v3, v4 -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX9-NODL-NEXT: v_mad_legacy_u16 v2, v6, v7, v2 -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v9, 24, v3 -; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v0, v5, v2 -; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v8, v9, v0 -; GFX9-NODL-NEXT: global_store_byte v1, v0, s[2:3] +; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v6, v7, v1 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v9, 24, v2 +; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1 +; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v8, v9, v1 +; GFX9-NODL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc8: @@ -500,25 +501,25 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[2:3] +; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v3 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 24, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 24, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v2, v3, v4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v6, v7, v2 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 24, v3 -; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v0, v5, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v8, v9, v0 -; GFX9-DL-NEXT: global_store_byte v1, v0, s[2:3] +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v6, v7, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 24, v2 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v8, v9, v1 +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc8: @@ -624,7 +625,6 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -633,6 +633,7 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8 ; GFX8-NEXT: v_bfe_i32 v4, v3, 8, 8 @@ -815,7 +816,6 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -824,6 +824,7 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v3 ; GFX8-NEXT: v_ashrrev_i32_e32 v4, 24, v3 @@ -861,11 +862,11 @@ ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v4, 8, v2 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v3), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v6, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 -; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v2, sext(v3), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v2, v5, s0, v2 +; GFX9-NODL-NEXT: v_add3_u32 v2, v5, s0, v3 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1 ; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm @@ -885,11 +886,11 @@ ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_lshrrev_b16_e32 v4, 8, v2 ; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v3), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v6, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 -; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v2, sext(v3), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_add3_u32 v2, v5, s0, v2 +; GFX9-DL-NEXT: v_add3_u32 v2, v5, s0, v3 ; GFX9-DL-NEXT: v_add3_u32 v1, v2, v6, v1 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm @@ -962,36 +963,36 @@ ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: buffer_load_ushort v8, off, s[0:3], 0 +; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 ; GFX7-NEXT: s_mov_b32 s4, 0xffff ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_bfe_i32 v1, v2, 8, 8 -; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_bfe_i32 v3, v2, 8, 8 +; GFX7-NEXT: v_bfe_i32 v4, v2, 0, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_bfe_i32 v5, v0, 8, 8 -; GFX7-NEXT: v_bfe_i32 v6, v0, 0, 8 -; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v5, s4, v6 -; GFX7-NEXT: v_bfe_i32 v7, v0, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX7-NEXT: v_bfe_i32 v6, v0, 8, 8 +; GFX7-NEXT: v_bfe_i32 v7, v0, 0, 8 +; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: v_and_b32_e32 v6, s4, v7 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX7-NEXT: v_bfe_i32 v8, v0, 16, 8 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v4 +; GFX7-NEXT: v_and_b32_e32 v7, s4, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 -; GFX7-NEXT: v_bfe_i32 v4, v2, 16, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX7-NEXT: v_bfe_i32 v5, v2, 16, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, v8 +; GFX7-NEXT: v_mad_u32_u24 v1, v3, v4, v1 ; GFX7-NEXT: v_ashrrev_i32_e32 v2, 24, v2 -; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX7-NEXT: v_and_b32_e32 v5, s4, v5 ; GFX7-NEXT: v_ashrrev_i32_e32 v0, 24, v0 -; GFX7-NEXT: v_mad_u32_u24 v1, v5, v7, v1 +; GFX7-NEXT: v_mad_u32_u24 v1, v6, v8, v1 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_mad_u32_u24 v1, v4, v6, v1 +; GFX7-NEXT: v_mad_u32_u24 v1, v5, v7, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -1005,32 +1006,32 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: flat_load_ushort v1, v[2:3] +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ushort v4, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_ashrrev_i16_e32 v7, 8, v4 -; GFX8-NEXT: v_bfe_i32 v4, v4, 0, 8 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX8-NEXT: v_ashrrev_i16_e32 v7, 8, v3 +; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8 ; GFX8-NEXT: v_ashrrev_i16_e32 v9, 8, v5 ; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 8 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX8-NEXT: v_ashrrev_i16_e32 v8, 8, v0 -; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX8-NEXT: v_ashrrev_i16_e32 v8, 8, v2 +; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 8 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u16 v0, v4, v0, v1 +; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 ; GFX8-NEXT: v_ashrrev_i16_e32 v10, 8, v6 ; GFX8-NEXT: v_bfe_i32 v6, v6, 0, 8 -; GFX8-NEXT: v_mad_u16 v0, v7, v8, v0 -; GFX8-NEXT: v_mad_u16 v0, v5, v6, v0 -; GFX8-NEXT: v_mad_u16 v0, v9, v10, v0 -; GFX8-NEXT: flat_store_short v[2:3], v0 +; GFX8-NEXT: v_mad_u16 v2, v7, v8, v2 +; GFX8-NEXT: v_mad_u16 v2, v5, v6, v2 +; GFX8-NEXT: v_mad_u16 v2, v9, v10, v2 +; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: idot4_acc16_vecMul: diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -21,6 +21,7 @@ ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 ; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) @@ -28,14 +29,13 @@ ; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, s4, v0 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, s5 ; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 ; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, s4 ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 @@ -49,7 +49,6 @@ ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_movk_i32 s2, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -58,6 +57,7 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v1, s2, v3 ; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8 @@ -104,14 +104,14 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] +; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, s0 -; GFX9-DL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc32: @@ -186,24 +186,24 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: buffer_load_ushort v8, off, s[0:3], 0 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 ; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8 -; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v2 +; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v5, s4, v0 -; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 +; GFX7-NEXT: v_and_b32_e32 v6, s4, v0 +; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, v8 -; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8 ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1 +; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8 +; GFX7-NEXT: v_bfe_u32 v8, v0, 16, 8 +; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 -; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 +; GFX7-NEXT: v_mad_u32_u24 v1, v5, v8, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -217,34 +217,34 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: flat_load_ushort v1, v[2:3] +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ushort v4, v[0:1] ; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: v_mov_b32_e32 v5, s0 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_and_b32_e32 v6, s0, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v4 +; GFX8-NEXT: v_and_b32_e32 v6, s0, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3 ; GFX8-NEXT: v_and_b32_e32 v8, s0, v8 -; GFX8-NEXT: v_and_b32_sdwa v10, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v4 +; GFX8-NEXT: v_and_b32_sdwa v10, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v7, s0, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v0 +; GFX8-NEXT: v_and_b32_e32 v7, s0, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v2 ; GFX8-NEXT: v_and_b32_e32 v9, s0, v9 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u16 v1, v6, v7, v1 -; GFX8-NEXT: v_and_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_mad_u16 v1, v8, v9, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 -; GFX8-NEXT: v_mad_u16 v1, v10, v5, v1 -; GFX8-NEXT: v_mad_u16 v0, v4, v0, v1 -; GFX8-NEXT: flat_store_short v[2:3], v0 +; GFX8-NEXT: v_mad_u16 v4, v6, v7, v4 +; GFX8-NEXT: v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_mad_u16 v4, v8, v9, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v2 +; GFX8-NEXT: v_mad_u16 v4, v10, v5, v4 +; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 +; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot4_acc16: @@ -252,30 +252,30 @@ ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-NODL-NEXT: global_load_ushort v4, v1, s[2:3] +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) -; GFX9-NODL-NEXT: v_and_b32_e32 v0, s0, v2 +; GFX9-NODL-NEXT: v_and_b32_e32 v4, s0, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) -; GFX9-NODL-NEXT: v_and_b32_e32 v5, s0, v3 -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v2 -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v3 +; GFX9-NODL-NEXT: v_and_b32_e32 v5, s0, v2 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v2 ; GFX9-NODL-NEXT: v_and_b32_e32 v6, s0, v6 ; GFX9-NODL-NEXT: v_and_b32_e32 v7, s0, v7 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v0, v5, v4 -; GFX9-NODL-NEXT: v_and_b32_sdwa v8, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NODL-NEXT: v_and_b32_sdwa v9, v3, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v6, v7, v0 +; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v4, v5, v3 +; GFX9-NODL-NEXT: v_and_b32_sdwa v8, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NODL-NEXT: v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v6, v7, v3 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v3, 24, v3 -; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v8, v9, v0 -; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v2, v3, v0 -; GFX9-NODL-NEXT: global_store_short v1, v0, s[2:3] +; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v8, v9, v3 +; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 +; GFX9-NODL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc16: @@ -283,30 +283,30 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_movk_i32 s0, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-DL-NEXT: global_load_ushort v4, v1, s[2:3] +; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_and_b32_e32 v0, s0, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v4, s0, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_and_b32_e32 v5, s0, v3 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v2 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v3 +; GFX9-DL-NEXT: v_and_b32_e32 v5, s0, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v6, s0, v6 ; GFX9-DL-NEXT: v_and_b32_e32 v7, s0, v7 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v0, v5, v4 -; GFX9-DL-NEXT: v_and_b32_sdwa v8, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-DL-NEXT: v_and_b32_sdwa v9, v3, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v6, v7, v0 +; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v4, v5, v3 +; GFX9-DL-NEXT: v_and_b32_sdwa v8, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v6, v7, v3 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v3, 24, v3 -; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v8, v9, v0 -; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v2, v3, v0 -; GFX9-DL-NEXT: global_store_short v1, v0, s[2:3] +; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v8, v9, v3 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 +; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc16: @@ -398,24 +398,24 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: buffer_load_ubyte v8, off, s[0:3], 0 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 ; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8 -; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v2 +; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v5, s4, v0 -; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 +; GFX7-NEXT: v_and_b32_e32 v6, s4, v0 +; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, v8 -; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8 ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1 +; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8 +; GFX7-NEXT: v_bfe_u32 v8, v0, 16, 8 +; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 -; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 +; GFX7-NEXT: v_mad_u32_u24 v1, v5, v8, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -429,27 +429,28 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_ubyte v5, v[2:3] -; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ubyte v4, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v3 +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v0 -; GFX8-NEXT: v_mad_u16 v0, v4, v0, v5 -; GFX8-NEXT: v_mad_u16 v0, v7, v8, v0 -; GFX8-NEXT: v_mad_u16 v0, v1, v6, v0 -; GFX8-NEXT: v_mad_u16 v0, v9, v10, v0 -; GFX8-NEXT: flat_store_byte v[2:3], v0 +; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 +; GFX8-NEXT: v_mad_u16 v2, v7, v8, v2 +; GFX8-NEXT: v_mad_u16 v2, v5, v6, v2 +; GFX8-NEXT: v_mad_u16 v2, v9, v10, v2 +; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot4_acc8: @@ -457,25 +458,25 @@ ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-NODL-NEXT: global_load_ubyte v4, v1, s[2:3] +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v2 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v3 -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v8, 24, v2 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v2 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v8, 24, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_legacy_u16 v2, v2, v3, v4 -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX9-NODL-NEXT: v_mad_legacy_u16 v2, v6, v7, v2 -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v9, 24, v3 -; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v0, v5, v2 -; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v8, v9, v0 -; GFX9-NODL-NEXT: global_store_byte v1, v0, s[2:3] +; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v6, v7, v1 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v9, 24, v2 +; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1 +; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v8, v9, v1 +; GFX9-NODL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc8: @@ -483,25 +484,25 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[2:3] +; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v3 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 24, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 24, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v2, v3, v4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v6, v7, v2 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 24, v3 -; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v0, v5, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v8, v9, v0 -; GFX9-DL-NEXT: global_store_byte v1, v0, s[2:3] +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v6, v7, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 24, v2 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v8, v9, v1 +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc8: @@ -581,18 +582,18 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: buffer_load_ubyte v4, off, s[0:3], 0 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 ; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_and_b32_e32 v1, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v2 ; GFX7-NEXT: v_bfe_u32 v2, v2, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v3, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v4, s4, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, v4 +; GFX7-NEXT: v_mad_u32_u24 v1, v3, v4, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -606,21 +607,22 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_ubyte v5, v[2:3] -; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ubyte v4, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 8, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v3 +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v0 -; GFX8-NEXT: v_mad_u16 v0, v4, v0, v5 -; GFX8-NEXT: v_mad_u16 v0, v1, v6, v0 -; GFX8-NEXT: flat_store_byte v[2:3], v0 +; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 +; GFX8-NEXT: v_mad_u16 v2, v5, v6, v2 +; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot2_8: @@ -722,24 +724,24 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: buffer_load_ubyte v8, off, s[0:3], 0 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 ; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8 -; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v2 +; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v5, s4, v0 -; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 +; GFX7-NEXT: v_and_b32_e32 v6, s4, v0 +; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v5, v1, v8 -; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8 ; GFX7-NEXT: v_mad_u32_u24 v1, v6, v3, v1 +; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8 +; GFX7-NEXT: v_bfe_u32 v8, v0, 16, 8 +; GFX7-NEXT: v_mad_u32_u24 v1, v7, v4, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 -; GFX7-NEXT: v_mad_u32_u24 v1, v7, v4, v1 +; GFX7-NEXT: v_mad_u32_u24 v1, v8, v5, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -753,27 +755,28 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_ubyte v5, v[2:3] -; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ubyte v4, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v3 +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v0 -; GFX8-NEXT: v_mad_u16 v0, v0, v4, v5 -; GFX8-NEXT: v_mad_u16 v0, v8, v7, v0 -; GFX8-NEXT: v_mad_u16 v0, v6, v1, v0 -; GFX8-NEXT: v_mad_u16 v0, v10, v9, v0 -; GFX8-NEXT: flat_store_byte v[2:3], v0 +; GFX8-NEXT: v_mad_u16 v2, v2, v3, v4 +; GFX8-NEXT: v_mad_u16 v2, v8, v7, v2 +; GFX8-NEXT: v_mad_u16 v2, v6, v5, v2 +; GFX8-NEXT: v_mad_u16 v2, v10, v9, v2 +; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot4_CommutationInsideMAD: @@ -781,25 +784,25 @@ ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-NODL-NEXT: global_load_ubyte v4, v1, s[2:3] +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v2 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v3 -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v8, 24, v2 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v2 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v8, 24, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_legacy_u16 v2, v3, v2, v4 -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX9-NODL-NEXT: v_mad_legacy_u16 v2, v7, v6, v2 -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v9, 24, v3 -; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v5, v0, v2 -; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v9, v8, v0 -; GFX9-NODL-NEXT: global_store_byte v1, v0, s[2:3] +; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v2, v1, v3 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v7, v6, v1 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v9, 24, v2 +; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v5, v4, v1 +; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v9, v8, v1 +; GFX9-NODL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_CommutationInsideMAD: @@ -807,25 +810,25 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[2:3] +; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v3 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 24, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 24, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v3, v2, v4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v7, v6, v2 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 24, v3 -; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v5, v0, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v9, v8, v0 -; GFX9-DL-NEXT: global_store_byte v1, v0, s[2:3] +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v2, v1, v3 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v7, v6, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 24, v2 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v4, v1 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v9, v8, v1 +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_CommutationInsideMAD: @@ -905,24 +908,24 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: buffer_load_ubyte v8, off, s[0:3], 0 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 ; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 +; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v2 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 -; GFX7-NEXT: v_and_b32_e32 v5, s4, v0 +; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8 +; GFX7-NEXT: v_and_b32_e32 v6, s4, v0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v3, v6, v3, v8 -; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8 -; GFX7-NEXT: v_mad_u32_u24 v1, v5, v1, v3 +; GFX7-NEXT: v_mad_u32_u24 v1, v7, v4, v1 +; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8 +; GFX7-NEXT: v_bfe_u32 v8, v0, 16, 8 +; GFX7-NEXT: v_mad_u32_u24 v1, v6, v3, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 -; GFX7-NEXT: v_mad_u32_u24 v1, v7, v4, v1 +; GFX7-NEXT: v_mad_u32_u24 v1, v8, v5, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -936,28 +939,28 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: flat_load_ubyte v10, v[2:3] +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ubyte v4, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v3 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u16 v6, v7, v6, v10 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v0 -; GFX8-NEXT: v_mad_u16 v0, v0, v4, v6 -; GFX8-NEXT: v_mad_u16 v0, v5, v1, v0 -; GFX8-NEXT: v_mad_u16 v0, v9, v8, v0 -; GFX8-NEXT: flat_store_byte v[2:3], v0 +; GFX8-NEXT: v_mad_u16 v4, v8, v7, v4 +; GFX8-NEXT: v_mad_u16 v2, v2, v3, v4 +; GFX8-NEXT: v_mad_u16 v2, v6, v5, v2 +; GFX8-NEXT: v_mad_u16 v2, v10, v9, v2 +; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot4_CommutationAccrossMADs: @@ -965,25 +968,25 @@ ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-NODL-NEXT: global_load_ubyte v9, v1, s[2:3] +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v3 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v2 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_legacy_u16 v5, v6, v5, v9 -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 24, v2 -; GFX9-NODL-NEXT: v_mad_legacy_u16 v2, v3, v2, v5 -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v8, 24, v3 -; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v4, v0, v2 -; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v8, v7, v0 -; GFX9-NODL-NEXT: global_store_byte v1, v0, s[2:3] +; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v7, v6, v3 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v8, 24, v1 +; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v2, v1, v3 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v9, 24, v2 +; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v5, v4, v1 +; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v9, v8, v1 +; GFX9-NODL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_CommutationAccrossMADs: @@ -991,25 +994,25 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-DL-NEXT: global_load_ubyte v9, v1, s[2:3] +; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v3 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_legacy_u16 v5, v6, v5, v9 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 24, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v3, v2, v5 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 24, v3 -; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v4, v0, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v8, v7, v0 -; GFX9-DL-NEXT: global_store_byte v1, v0, s[2:3] +; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v7, v6, v3 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 24, v1 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v2, v1, v3 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 24, v2 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v4, v1 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v9, v8, v1 +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_CommutationAccrossMADs: @@ -1089,6 +1092,7 @@ ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 ; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) @@ -1096,15 +1100,14 @@ ; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, s4, v0 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 -; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 -; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v8, v1, v5, s4 +; GFX7-NEXT: v_mad_u32_u24 v8, v1, v5, s5 ; GFX7-NEXT: v_mad_u32_u24 v3, v3, v6, v8 +; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 +; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8 ; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 @@ -1118,7 +1121,6 @@ ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_movk_i32 s2, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -1127,6 +1129,7 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v1, s2, v3 ; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8 @@ -1315,7 +1318,6 @@ ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_movk_i32 s2, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -1324,6 +1326,7 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8 ; GFX8-NEXT: v_and_b32_e32 v1, s2, v3 @@ -1479,8 +1482,8 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 ; GFX7-NEXT: s_mov_b32 s4, 0xffff ; GFX7-NEXT: s_waitcnt vmcnt(2) @@ -1508,38 +1511,38 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_movk_i32 s2, 0xff -; GFX8-NEXT: v_mov_b32_e32 v5, s2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_load_ushort v10, v[2:3] +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ushort v4, v[0:1] +; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: v_mov_b32_e32 v5, s0 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v4 -; GFX8-NEXT: v_and_b32_e32 v7, s2, v7 -; GFX8-NEXT: v_bfe_i32 v1, v4, 0, 8 -; GFX8-NEXT: v_and_b32_sdwa v9, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3 +; GFX8-NEXT: v_and_b32_e32 v8, s0, v8 +; GFX8-NEXT: v_bfe_i32 v6, v3, 0, 8 +; GFX8-NEXT: v_and_b32_sdwa v10, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v0 -; GFX8-NEXT: v_and_b32_e32 v8, s2, v8 -; GFX8-NEXT: v_bfe_i32 v6, v0, 0, 8 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX8-NEXT: v_and_b32_e32 v9, s0, v9 +; GFX8-NEXT: v_bfe_i32 v7, v2, 0, 8 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u16 v7, v7, v8, v10 -; GFX8-NEXT: v_and_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_mad_u16 v1, v1, v6, v7 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 -; GFX8-NEXT: v_mad_u16 v1, v9, v5, v1 -; GFX8-NEXT: v_mad_u16 v0, v4, v0, v1 -; GFX8-NEXT: flat_store_short v[2:3], v0 +; GFX8-NEXT: v_mad_u16 v4, v8, v9, v4 +; GFX8-NEXT: v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_mad_u16 v4, v6, v7, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v2 +; GFX8-NEXT: v_mad_u16 v4, v10, v5, v4 +; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 +; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: notdot4_mixedtypes: @@ -1547,30 +1550,30 @@ ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-NODL-NEXT: global_load_ushort v9, v1, s[2:3] +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v3 -; GFX9-NODL-NEXT: v_and_b32_e32 v5, s0, v5 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v2 ; GFX9-NODL-NEXT: v_and_b32_e32 v6, s0, v6 -; GFX9-NODL-NEXT: v_bfe_i32 v0, v2, 0, 8 -; GFX9-NODL-NEXT: v_bfe_i32 v4, v3, 0, 8 +; GFX9-NODL-NEXT: v_and_b32_e32 v7, s0, v7 +; GFX9-NODL-NEXT: v_bfe_i32 v4, v1, 0, 8 +; GFX9-NODL-NEXT: v_bfe_i32 v5, v2, 0, 8 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_legacy_u16 v5, v5, v6, v9 -; GFX9-NODL-NEXT: v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NODL-NEXT: v_and_b32_sdwa v8, v3, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v0, v4, v5 +; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v6, v7, v3 +; GFX9-NODL-NEXT: v_and_b32_sdwa v8, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NODL-NEXT: v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v4, v5, v3 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v3, 24, v3 -; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v7, v8, v0 -; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v2, v3, v0 -; GFX9-NODL-NEXT: global_store_short v1, v0, s[2:3] +; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v8, v9, v3 +; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 +; GFX9-NODL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notdot4_mixedtypes: @@ -1578,30 +1581,30 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_movk_i32 s0, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-DL-NEXT: global_load_ushort v9, v1, s[2:3] +; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v3 -; GFX9-DL-NEXT: v_and_b32_e32 v5, s0, v5 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v6, s0, v6 -; GFX9-DL-NEXT: v_bfe_i32 v0, v2, 0, 8 -; GFX9-DL-NEXT: v_bfe_i32 v4, v3, 0, 8 +; GFX9-DL-NEXT: v_and_b32_e32 v7, s0, v7 +; GFX9-DL-NEXT: v_bfe_i32 v4, v1, 0, 8 +; GFX9-DL-NEXT: v_bfe_i32 v5, v2, 0, 8 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_legacy_u16 v5, v5, v6, v9 -; GFX9-DL-NEXT: v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-DL-NEXT: v_and_b32_sdwa v8, v3, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v0, v4, v5 +; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v6, v7, v3 +; GFX9-DL-NEXT: v_and_b32_sdwa v8, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v4, v5, v3 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v3, 24, v3 -; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v7, v8, v0 -; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v2, v3, v0 -; GFX9-DL-NEXT: global_store_short v1, v0, s[2:3] +; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v8, v9, v3 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 +; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notdot4_mixedtypes: @@ -1695,6 +1698,7 @@ ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 ; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) @@ -1707,9 +1711,8 @@ ; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 ; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, s4 +; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, s5 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v6, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v4, v7, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v1, v5, v0 @@ -1723,7 +1726,6 @@ ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_movk_i32 s2, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -1732,6 +1734,7 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v3 ; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 8 @@ -1864,31 +1867,31 @@ ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: buffer_load_ushort v8, off, s[0:3], 0 +; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 ; GFX7-NEXT: s_mov_b32 s4, 0xff00 ; GFX7-NEXT: s_movk_i32 s5, 0xff ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX7-NEXT: v_and_b32_e32 v3, s5, v2 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v4, s5, v2 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v5, s4, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_and_b32_e32 v6, s5, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v5 -; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v1, s5, v1 +; GFX7-NEXT: v_and_b32_e32 v6, s4, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX7-NEXT: v_and_b32_e32 v7, s5, v0 +; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v3, s5, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s5, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, v8 -; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 -; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8 -; GFX7-NEXT: v_mad_u32_u24 v1, v5, v6, v1 +; GFX7-NEXT: v_mad_u32_u24 v1, v3, v4, v1 +; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8 +; GFX7-NEXT: v_bfe_u32 v8, v0, 16, 8 +; GFX7-NEXT: v_mad_u32_u24 v1, v6, v7, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 -; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 +; GFX7-NEXT: v_mad_u32_u24 v1, v5, v8, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -1898,36 +1901,36 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_movk_i32 s2, 0xff -; GFX8-NEXT: v_mov_b32_e32 v5, s2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: flat_load_ushort v10, v[2:3] +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ushort v4, v[0:1] +; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: v_mov_b32_e32 v5, s0 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v4 -; GFX8-NEXT: v_lshrrev_b16_e32 v6, 8, v4 -; GFX8-NEXT: v_and_b32_sdwa v9, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v4, s2, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3 +; GFX8-NEXT: v_lshrrev_b16_e32 v7, 8, v3 +; GFX8-NEXT: v_and_b32_sdwa v10, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v3, s0, v3 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v8, 8, v0 -; GFX8-NEXT: v_and_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v2 +; GFX8-NEXT: v_lshrrev_b16_e32 v9, 8, v2 +; GFX8-NEXT: v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v2, s0, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u16 v0, v4, v0, v10 -; GFX8-NEXT: v_mad_u16 v0, v6, v8, v0 -; GFX8-NEXT: v_mad_u16 v0, v9, v5, v0 -; GFX8-NEXT: v_mad_u16 v0, v1, v7, v0 -; GFX8-NEXT: flat_store_short v[2:3], v0 +; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 +; GFX8-NEXT: v_mad_u16 v2, v7, v9, v2 +; GFX8-NEXT: v_mad_u16 v2, v10, v5, v2 +; GFX8-NEXT: v_mad_u16 v2, v6, v8, v2 +; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot4_acc16_vecMul: @@ -2089,25 +2092,25 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: buffer_load_ubyte v8, off, s[0:3], 0 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 ; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_and_b32_e32 v3, s4, v2 -; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v2 +; GFX7-NEXT: v_and_b32_e32 v4, s4, v2 +; GFX7-NEXT: v_bfe_u32 v5, v2, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v6, s4, v0 -; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8 +; GFX7-NEXT: v_and_b32_e32 v7, s4, v0 +; GFX7-NEXT: v_bfe_u32 v8, v0, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v3, v3, v6, v8 +; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v2 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_mad_u32_u24 v3, v4, v7, v3 -; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v3 -; GFX7-NEXT: v_mad_u32_u24 v0, v1, v5, v0 +; GFX7-NEXT: v_mad_u32_u24 v1, v5, v8, v1 +; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 +; GFX7-NEXT: v_mad_u32_u24 v0, v3, v6, v0 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -174,15 +174,15 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] +; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot8_i32_i4 v0, v2, v3, s0 -; GFX9-DL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-DL-NEXT: v_dot8_i32_i4 v1, v1, v2, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-XNACK-LABEL: idot8_acc32: @@ -343,51 +343,51 @@ ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: buffer_load_ushort v16, off, s[0:3], 0 +; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 ; GFX7-NEXT: s_mov_b32 s4, 0xffff ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 4 -; GFX7-NEXT: v_bfe_i32 v3, v2, 4, 4 +; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 4 +; GFX7-NEXT: v_bfe_i32 v4, v2, 4, 4 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_bfe_i32 v9, v0, 0, 4 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX7-NEXT: v_bfe_i32 v10, v0, 4, 4 -; GFX7-NEXT: v_and_b32_e32 v9, s4, v9 -; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 4 +; GFX7-NEXT: v_bfe_i32 v10, v0, 0, 4 ; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 -; GFX7-NEXT: v_bfe_i32 v11, v0, 8, 4 +; GFX7-NEXT: v_bfe_i32 v11, v0, 4, 4 ; GFX7-NEXT: v_and_b32_e32 v10, s4, v10 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v1, v9, v16 -; GFX7-NEXT: v_bfe_i32 v5, v2, 12, 4 +; GFX7-NEXT: v_bfe_i32 v5, v2, 8, 4 ; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 -; GFX7-NEXT: v_bfe_i32 v12, v0, 12, 4 +; GFX7-NEXT: v_bfe_i32 v12, v0, 8, 4 ; GFX7-NEXT: v_and_b32_e32 v11, s4, v11 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v10, v1 -; GFX7-NEXT: v_bfe_i32 v6, v2, 16, 4 +; GFX7-NEXT: v_bfe_i32 v6, v2, 12, 4 ; GFX7-NEXT: v_and_b32_e32 v5, s4, v5 -; GFX7-NEXT: v_bfe_i32 v13, v0, 16, 4 +; GFX7-NEXT: v_bfe_i32 v13, v0, 12, 4 ; GFX7-NEXT: v_and_b32_e32 v12, s4, v12 ; GFX7-NEXT: v_mad_u32_u24 v1, v4, v11, v1 -; GFX7-NEXT: v_bfe_i32 v7, v2, 20, 4 +; GFX7-NEXT: v_bfe_i32 v7, v2, 16, 4 ; GFX7-NEXT: v_and_b32_e32 v6, s4, v6 -; GFX7-NEXT: v_bfe_i32 v14, v0, 20, 4 +; GFX7-NEXT: v_bfe_i32 v14, v0, 16, 4 ; GFX7-NEXT: v_and_b32_e32 v13, s4, v13 ; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1 -; GFX7-NEXT: v_bfe_i32 v8, v2, 24, 4 +; GFX7-NEXT: v_bfe_i32 v8, v2, 20, 4 ; GFX7-NEXT: v_and_b32_e32 v7, s4, v7 -; GFX7-NEXT: v_bfe_i32 v15, v0, 24, 4 +; GFX7-NEXT: v_bfe_i32 v15, v0, 20, 4 ; GFX7-NEXT: v_and_b32_e32 v14, s4, v14 ; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1 -; GFX7-NEXT: v_ashrrev_i32_e32 v2, 28, v2 +; GFX7-NEXT: v_bfe_i32 v9, v2, 24, 4 ; GFX7-NEXT: v_and_b32_e32 v8, s4, v8 -; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0 +; GFX7-NEXT: v_bfe_i32 v16, v0, 24, 4 ; GFX7-NEXT: v_and_b32_e32 v15, s4, v15 ; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1 +; GFX7-NEXT: v_ashrrev_i32_e32 v2, 28, v2 +; GFX7-NEXT: v_and_b32_e32 v9, s4, v9 +; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0 +; GFX7-NEXT: v_and_b32_e32 v16, s4, v16 +; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1 +; GFX7-NEXT: v_mad_u32_u24 v1, v9, v16, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -916,51 +916,51 @@ ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: buffer_load_ubyte v16, off, s[0:3], 0 +; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 ; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 4 -; GFX7-NEXT: v_bfe_i32 v3, v2, 4, 4 +; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 4 +; GFX7-NEXT: v_bfe_i32 v4, v2, 4, 4 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_bfe_i32 v9, v0, 0, 4 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX7-NEXT: v_bfe_i32 v10, v0, 4, 4 -; GFX7-NEXT: v_and_b32_e32 v9, s4, v9 -; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 4 +; GFX7-NEXT: v_bfe_i32 v10, v0, 0, 4 ; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 -; GFX7-NEXT: v_bfe_i32 v11, v0, 8, 4 +; GFX7-NEXT: v_bfe_i32 v11, v0, 4, 4 ; GFX7-NEXT: v_and_b32_e32 v10, s4, v10 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v1, v9, v16 -; GFX7-NEXT: v_bfe_i32 v5, v2, 12, 4 +; GFX7-NEXT: v_bfe_i32 v5, v2, 8, 4 ; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 -; GFX7-NEXT: v_bfe_i32 v12, v0, 12, 4 +; GFX7-NEXT: v_bfe_i32 v12, v0, 8, 4 ; GFX7-NEXT: v_and_b32_e32 v11, s4, v11 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v10, v1 -; GFX7-NEXT: v_bfe_i32 v6, v2, 16, 4 +; GFX7-NEXT: v_bfe_i32 v6, v2, 12, 4 ; GFX7-NEXT: v_and_b32_e32 v5, s4, v5 -; GFX7-NEXT: v_bfe_i32 v13, v0, 16, 4 +; GFX7-NEXT: v_bfe_i32 v13, v0, 12, 4 ; GFX7-NEXT: v_and_b32_e32 v12, s4, v12 ; GFX7-NEXT: v_mad_u32_u24 v1, v4, v11, v1 -; GFX7-NEXT: v_bfe_i32 v7, v2, 20, 4 +; GFX7-NEXT: v_bfe_i32 v7, v2, 16, 4 ; GFX7-NEXT: v_and_b32_e32 v6, s4, v6 -; GFX7-NEXT: v_bfe_i32 v14, v0, 20, 4 +; GFX7-NEXT: v_bfe_i32 v14, v0, 16, 4 ; GFX7-NEXT: v_and_b32_e32 v13, s4, v13 ; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1 -; GFX7-NEXT: v_bfe_i32 v8, v2, 24, 4 +; GFX7-NEXT: v_bfe_i32 v8, v2, 20, 4 ; GFX7-NEXT: v_and_b32_e32 v7, s4, v7 -; GFX7-NEXT: v_bfe_i32 v15, v0, 24, 4 +; GFX7-NEXT: v_bfe_i32 v15, v0, 20, 4 ; GFX7-NEXT: v_and_b32_e32 v14, s4, v14 ; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1 -; GFX7-NEXT: v_ashrrev_i32_e32 v2, 28, v2 +; GFX7-NEXT: v_bfe_i32 v9, v2, 24, 4 ; GFX7-NEXT: v_and_b32_e32 v8, s4, v8 -; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0 +; GFX7-NEXT: v_bfe_i32 v16, v0, 24, 4 ; GFX7-NEXT: v_and_b32_e32 v15, s4, v15 ; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1 +; GFX7-NEXT: v_ashrrev_i32_e32 v2, 28, v2 +; GFX7-NEXT: v_and_b32_e32 v9, s4, v9 +; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0 +; GFX7-NEXT: v_and_b32_e32 v16, s4, v16 +; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1 +; GFX7-NEXT: v_mad_u32_u24 v1, v9, v16, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -1968,43 +1968,43 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s6, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_mov_b32 s10, -1 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000 ; GFX8-NEXT: s_add_u32 s8, s8, s3 ; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_ashrrev_i32_e32 v2, 28, v0 -; GFX8-NEXT: v_bfe_i32 v3, v0, 24, 4 -; GFX8-NEXT: v_bfe_i32 v4, v0, 20, 4 -; GFX8-NEXT: v_bfe_i32 v5, v0, 16, 4 -; GFX8-NEXT: v_bfe_i32 v6, v0, 12, 4 -; GFX8-NEXT: v_bfe_i32 v7, v0, 8, 4 -; GFX8-NEXT: v_bfe_i32 v8, v0, 4, 4 -; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 4 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 28, v3 +; GFX8-NEXT: v_bfe_i32 v2, v3, 24, 4 +; GFX8-NEXT: v_bfe_i32 v4, v3, 20, 4 +; GFX8-NEXT: v_bfe_i32 v5, v3, 16, 4 +; GFX8-NEXT: v_bfe_i32 v6, v3, 12, 4 +; GFX8-NEXT: v_bfe_i32 v7, v3, 8, 4 +; GFX8-NEXT: v_bfe_i32 v8, v3, 4, 4 +; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 4 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_ashrrev_i32_e32 v9, 28, v1 -; GFX8-NEXT: v_bfe_i32 v10, v1, 24, 4 -; GFX8-NEXT: v_bfe_i32 v11, v1, 20, 4 -; GFX8-NEXT: v_bfe_i32 v12, v1, 16, 4 -; GFX8-NEXT: v_bfe_i32 v13, v1, 12, 4 -; GFX8-NEXT: v_bfe_i32 v14, v1, 8, 4 -; GFX8-NEXT: v_bfe_i32 v15, v1, 4, 4 -; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 4 +; GFX8-NEXT: v_ashrrev_i32_e32 v9, 28, v0 +; GFX8-NEXT: v_bfe_i32 v10, v0, 24, 4 +; GFX8-NEXT: v_bfe_i32 v11, v0, 20, 4 +; GFX8-NEXT: v_bfe_i32 v12, v0, 16, 4 +; GFX8-NEXT: v_bfe_i32 v13, v0, 12, 4 +; GFX8-NEXT: v_bfe_i32 v14, v0, 8, 4 +; GFX8-NEXT: v_bfe_i32 v15, v0, 4, 4 +; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v0, v0, v1, s2 +; GFX8-NEXT: v_mad_i32_i24 v0, v3, v0, s2 ; GFX8-NEXT: v_mad_i32_i24 v0, v8, v15, v0 ; GFX8-NEXT: v_mad_i32_i24 v0, v7, v14, v0 ; GFX8-NEXT: v_mad_i32_i24 v0, v6, v13, v0 ; GFX8-NEXT: v_mad_i32_i24 v0, v5, v12, v0 ; GFX8-NEXT: v_mad_i32_i24 v0, v4, v11, v0 -; GFX8-NEXT: v_mad_i32_i24 v0, v3, v10, v0 -; GFX8-NEXT: v_mad_i32_i24 v2, v2, v9, v0 +; GFX8-NEXT: v_mad_i32_i24 v0, v2, v10, v0 +; GFX8-NEXT: v_mad_i32_i24 v2, v1, v9, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -2070,15 +2070,15 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] +; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot8_i32_i4 v0, v2, v3, s0 -; GFX9-DL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-DL-NEXT: v_dot8_i32_i4 v1, v1, v2, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-XNACK-LABEL: idot8_acc32_vecMul: @@ -2203,64 +2203,64 @@ ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: buffer_load_ushort v16, off, s[0:3], 0 +; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 ; GFX7-NEXT: s_mov_b32 s4, 0xffff ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_bfe_i32 v1, v2, 20, 4 -; GFX7-NEXT: v_bfe_i32 v3, v2, 16, 4 -; GFX7-NEXT: v_bfe_i32 v4, v2, 4, 4 -; GFX7-NEXT: v_bfe_i32 v5, v2, 0, 4 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_and_b32_e32 v5, s4, v5 +; GFX7-NEXT: v_bfe_i32 v3, v2, 20, 4 +; GFX7-NEXT: v_bfe_i32 v4, v2, 16, 4 +; GFX7-NEXT: v_bfe_i32 v5, v2, 4, 4 +; GFX7-NEXT: v_bfe_i32 v6, v2, 0, 4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v6, s4, v6 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_bfe_i32 v9, v0, 20, 4 -; GFX7-NEXT: v_bfe_i32 v10, v0, 16, 4 -; GFX7-NEXT: v_bfe_i32 v11, v0, 4, 4 -; GFX7-NEXT: v_bfe_i32 v12, v0, 0, 4 -; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v9 -; GFX7-NEXT: v_and_b32_e32 v5, s4, v10 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v11 -; GFX7-NEXT: v_and_b32_e32 v10, s4, v12 -; GFX7-NEXT: v_bfe_i32 v13, v0, 24, 4 -; GFX7-NEXT: v_ashrrev_i32_e32 v15, 28, v0 -; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX7-NEXT: v_or_b32_e32 v5, v10, v9 +; GFX7-NEXT: v_bfe_i32 v10, v0, 20, 4 +; GFX7-NEXT: v_bfe_i32 v11, v0, 16, 4 +; GFX7-NEXT: v_bfe_i32 v12, v0, 4, 4 +; GFX7-NEXT: v_bfe_i32 v13, v0, 0, 4 +; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v10 +; GFX7-NEXT: v_and_b32_e32 v6, s4, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v12 ; GFX7-NEXT: v_and_b32_e32 v11, s4, v13 +; GFX7-NEXT: v_bfe_i32 v14, v0, 24, 4 +; GFX7-NEXT: v_ashrrev_i32_e32 v16, 28, v0 +; GFX7-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX7-NEXT: v_or_b32_e32 v6, v11, v10 +; GFX7-NEXT: v_and_b32_e32 v12, s4, v14 +; GFX7-NEXT: v_and_b32_e32 v14, s4, v16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v6, s4, v6 +; GFX7-NEXT: v_bfe_i32 v8, v2, 8, 4 +; GFX7-NEXT: v_bfe_i32 v15, v0, 8, 4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mad_u32_u24 v1, v4, v6, v1 +; GFX7-NEXT: v_bfe_i32 v7, v2, 24, 4 +; GFX7-NEXT: v_ashrrev_i32_e32 v9, 28, v2 +; GFX7-NEXT: v_bfe_i32 v2, v2, 12, 4 +; GFX7-NEXT: v_and_b32_e32 v8, s4, v8 +; GFX7-NEXT: v_bfe_i32 v0, v0, 12, 4 ; GFX7-NEXT: v_and_b32_e32 v13, s4, v15 +; GFX7-NEXT: v_mad_u32_u24 v1, v16, v11, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_mad_u32_u24 v1, v8, v13, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, s4, v5 -; GFX7-NEXT: v_bfe_i32 v7, v2, 8, 4 -; GFX7-NEXT: v_bfe_i32 v14, v0, 8, 4 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v3, v3, v5, v16 -; GFX7-NEXT: v_bfe_i32 v6, v2, 24, 4 -; GFX7-NEXT: v_ashrrev_i32_e32 v8, 28, v2 -; GFX7-NEXT: v_bfe_i32 v2, v2, 12, 4 +; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 +; GFX7-NEXT: v_mad_u32_u24 v0, v3, v5, v0 ; GFX7-NEXT: v_and_b32_e32 v7, s4, v7 -; GFX7-NEXT: v_bfe_i32 v0, v0, 12, 4 -; GFX7-NEXT: v_and_b32_e32 v12, s4, v14 -; GFX7-NEXT: v_mad_u32_u24 v3, v15, v10, v3 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_mad_u32_u24 v3, v7, v12, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 -; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v3 -; GFX7-NEXT: v_mad_u32_u24 v0, v1, v4, v0 -; GFX7-NEXT: v_and_b32_e32 v6, s4, v6 -; GFX7-NEXT: v_mad_u32_u24 v0, v14, v9, v0 -; GFX7-NEXT: v_and_b32_e32 v8, s4, v8 -; GFX7-NEXT: v_mad_u32_u24 v0, v6, v11, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v8, v13, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v15, v10, v0 +; GFX7-NEXT: v_and_b32_e32 v9, s4, v9 +; GFX7-NEXT: v_mad_u32_u24 v0, v7, v12, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v9, v14, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -2354,77 +2354,77 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[4:5] -; GFX9-NEXT: global_load_dword v4, v0, s[6:7] -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_and_b32_e32 v11, 15, v1 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v10, 15, v3 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v17, 15, v4 -; GFX9-NEXT: v_bfe_u32 v0, v3, 24, 4 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 28, v3 -; GFX9-NEXT: v_bfe_u32 v6, v3, 16, 4 -; GFX9-NEXT: v_bfe_u32 v7, v3, 20, 4 -; GFX9-NEXT: v_bfe_u32 v8, v3, 8, 4 -; GFX9-NEXT: v_bfe_u32 v9, v3, 12, 4 -; GFX9-NEXT: v_bfe_u32 v3, v3, 4, 4 -; GFX9-NEXT: v_bfe_u32 v11, v4, 24, 4 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 28, v4 -; GFX9-NEXT: v_bfe_u32 v13, v4, 16, 4 -; GFX9-NEXT: v_bfe_u32 v14, v4, 20, 4 -; GFX9-NEXT: v_bfe_u32 v15, v4, 8, 4 -; GFX9-NEXT: v_bfe_u32 v16, v4, 12, 4 -; GFX9-NEXT: v_bfe_u32 v4, v4, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v10, v2, v10 -; GFX9-NEXT: v_and_b32_e32 v17, v2, v17 -; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v10 -; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v17 -; GFX9-NEXT: v_pk_lshlrev_b16 v3, 12, v3 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v4, 12, v4 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_mul_lo_u16 v3, v3, v4 -; GFX9-NEXT: global_load_ushort v4, v1, s[2:3] -; GFX9-NEXT: v_and_b32_e32 v8, v2, v8 -; GFX9-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX9-NEXT: v_and_b32_e32 v15, v2, v15 -; GFX9-NEXT: v_lshl_or_b32 v8, v9, 16, v8 -; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v5, v16, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v6, v2, v6 -; GFX9-NEXT: v_and_b32_e32 v13, v2, v13 -; GFX9-NEXT: v_and_b32_e32 v2, v2, v11 -; GFX9-NEXT: v_pk_lshlrev_b16 v8, 12, v8 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v5, 12, v5 op_sel_hi:[0,1] -; GFX9-NEXT: v_lshl_or_b32 v6, v7, 16, v6 -; GFX9-NEXT: v_lshl_or_b32 v7, v14, 16, v13 -; GFX9-NEXT: v_lshl_or_b32 v2, v12, 16, v2 -; GFX9-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v6, 12, v6 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v0, 12, v0 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1] +; GFX9-NEXT: v_and_b32_e32 v18, 15, v2 +; GFX9-NEXT: v_bfe_u32 v5, v1, 24, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 28, v1 +; GFX9-NEXT: v_bfe_u32 v7, v1, 16, 4 +; GFX9-NEXT: v_bfe_u32 v8, v1, 20, 4 +; GFX9-NEXT: v_bfe_u32 v9, v1, 8, 4 +; GFX9-NEXT: v_bfe_u32 v10, v1, 12, 4 +; GFX9-NEXT: v_bfe_u32 v1, v1, 4, 4 +; GFX9-NEXT: v_bfe_u32 v12, v2, 24, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 28, v2 +; GFX9-NEXT: v_bfe_u32 v14, v2, 16, 4 +; GFX9-NEXT: v_bfe_u32 v15, v2, 20, 4 +; GFX9-NEXT: v_bfe_u32 v16, v2, 8, 4 +; GFX9-NEXT: v_bfe_u32 v17, v2, 12, 4 +; GFX9-NEXT: v_bfe_u32 v2, v2, 4, 4 +; GFX9-NEXT: v_and_b32_e32 v11, v4, v11 +; GFX9-NEXT: v_and_b32_e32 v18, v4, v18 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v18 +; GFX9-NEXT: v_and_b32_e32 v9, v4, v9 +; GFX9-NEXT: v_and_b32_e32 v5, v4, v5 +; GFX9-NEXT: v_and_b32_e32 v16, v4, v16 +; GFX9-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v2, 12, v2 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_mul_lo_u16 v5, v8, v5 -; GFX9-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] +; GFX9-NEXT: v_lshl_or_b32 v9, v10, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v5, v6, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v6, v17, 16, v16 +; GFX9-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX9-NEXT: v_pk_mul_lo_u16 v2, v6, v7 +; GFX9-NEXT: v_and_b32_e32 v7, v4, v7 +; GFX9-NEXT: v_and_b32_e32 v14, v4, v14 +; GFX9-NEXT: v_and_b32_e32 v4, v4, v12 +; GFX9-NEXT: v_pk_lshlrev_b16 v9, 12, v9 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v6, 12, v6 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 +; GFX9-NEXT: v_lshl_or_b32 v7, v8, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v8, v15, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v4, v13, 16, v4 +; GFX9-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v4, v3, v4 -; GFX9-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u16_e32 v3, v3, v5 -; GFX9-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u16_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u16_e32 v2, v2, v0 -; GFX9-NEXT: v_add_u16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: global_store_short v1, v0, s[2:3] +; GFX9-NEXT: v_add_u16_e32 v2, v1, v3 +; GFX9-NEXT: v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v5, 12, v5 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v8, 12, v8 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v4, 12, v4 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_mul_lo_u16 v6, v9, v6 +; GFX9-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] +; GFX9-NEXT: v_add_u16_e32 v1, v1, v6 +; GFX9-NEXT: v_pk_mul_lo_u16 v4, v5, v4 +; GFX9-NEXT: v_pk_mul_lo_u16 v5, v7, v8 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u16_e32 v1, v1, v5 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u16_e32 v1, v1, v4 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc16_vecMul: @@ -2437,77 +2437,77 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-DL-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v3, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v4, v0, s[6:7] -; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DL-NEXT: s_waitcnt vmcnt(2) +; GFX9-DL-NEXT: v_and_b32_e32 v11, 15, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_and_b32_e32 v10, 15, v3 -; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_and_b32_e32 v17, 15, v4 -; GFX9-DL-NEXT: v_bfe_u32 v0, v3, 24, 4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v3 -; GFX9-DL-NEXT: v_bfe_u32 v6, v3, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v7, v3, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v8, v3, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v9, v3, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v3, v3, 4, 4 -; GFX9-DL-NEXT: v_bfe_u32 v11, v4, 24, 4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 28, v4 -; GFX9-DL-NEXT: v_bfe_u32 v13, v4, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v14, v4, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v15, v4, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v16, v4, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v4, v4, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v10, v2, v10 -; GFX9-DL-NEXT: v_and_b32_e32 v17, v2, v17 -; GFX9-DL-NEXT: v_lshl_or_b32 v3, v3, 16, v10 -; GFX9-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v17 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v3, 12, v3 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v4, 12, v4 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v4 -; GFX9-DL-NEXT: global_load_ushort v4, v1, s[2:3] -; GFX9-DL-NEXT: v_and_b32_e32 v8, v2, v8 -; GFX9-DL-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX9-DL-NEXT: v_and_b32_e32 v15, v2, v15 -; GFX9-DL-NEXT: v_lshl_or_b32 v8, v9, 16, v8 -; GFX9-DL-NEXT: v_lshl_or_b32 v0, v5, 16, v0 -; GFX9-DL-NEXT: v_lshl_or_b32 v5, v16, 16, v15 -; GFX9-DL-NEXT: v_and_b32_e32 v6, v2, v6 -; GFX9-DL-NEXT: v_and_b32_e32 v13, v2, v13 -; GFX9-DL-NEXT: v_and_b32_e32 v2, v2, v11 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v8, 12, v8 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v5, 12, v5 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_lshl_or_b32 v6, v7, 16, v6 -; GFX9-DL-NEXT: v_lshl_or_b32 v7, v14, 16, v13 -; GFX9-DL-NEXT: v_lshl_or_b32 v2, v12, 16, v2 -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v6, 12, v6 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v0, 12, v0 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_and_b32_e32 v18, 15, v2 +; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 24, 4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 28, v1 +; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 16, 4 +; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 20, 4 +; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 8, 4 +; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 12, 4 +; GFX9-DL-NEXT: v_bfe_u32 v1, v1, 4, 4 +; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 24, 4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 28, v2 +; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 16, 4 +; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 20, 4 +; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 8, 4 +; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 12, 4 +; GFX9-DL-NEXT: v_bfe_u32 v2, v2, 4, 4 +; GFX9-DL-NEXT: v_and_b32_e32 v11, v4, v11 +; GFX9-DL-NEXT: v_and_b32_e32 v18, v4, v18 +; GFX9-DL-NEXT: v_lshl_or_b32 v1, v1, 16, v11 +; GFX9-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v18 +; GFX9-DL-NEXT: v_and_b32_e32 v9, v4, v9 +; GFX9-DL-NEXT: v_and_b32_e32 v5, v4, v5 +; GFX9-DL-NEXT: v_and_b32_e32 v16, v4, v16 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v2, 12, v2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v8, v5 -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_lshl_or_b32 v9, v10, 16, v9 +; GFX9-DL-NEXT: v_lshl_or_b32 v5, v6, 16, v5 +; GFX9-DL-NEXT: v_lshl_or_b32 v6, v17, 16, v16 +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v6, v7 +; GFX9-DL-NEXT: v_and_b32_e32 v7, v4, v7 +; GFX9-DL-NEXT: v_and_b32_e32 v14, v4, v14 +; GFX9-DL-NEXT: v_and_b32_e32 v4, v4, v12 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v9, 12, v9 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v6, 12, v6 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 +; GFX9-DL-NEXT: v_lshl_or_b32 v7, v8, 16, v7 +; GFX9-DL-NEXT: v_lshl_or_b32 v8, v15, 16, v14 +; GFX9-DL-NEXT: v_lshl_or_b32 v4, v13, 16, v4 +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u16_e32 v4, v3, v4 -; GFX9-DL-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v5 -; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v2 -; GFX9-DL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v0 -; GFX9-DL-NEXT: v_add_u16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: global_store_short v1, v0, s[2:3] +; GFX9-DL-NEXT: v_add_u16_e32 v2, v1, v3 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v5, 12, v5 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v8, 12, v8 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v4, 12, v4 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v6, v9, v6 +; GFX9-DL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v6 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v4 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v7, v8 +; GFX9-DL-NEXT: v_add_u16_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v5 +; GFX9-DL-NEXT: v_add_u16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v4 +; GFX9-DL-NEXT: v_add_u16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-XNACK-LABEL: idot8_acc16_vecMul: @@ -2813,90 +2813,90 @@ ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 +; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: s_mov_b32 s5, 0xffff ; GFX7-NEXT: s_addc_u32 s13, s13, 0 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_ashrrev_i32_e32 v1, 28, v2 -; GFX7-NEXT: v_bfe_i32 v3, v2, 24, 4 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v1 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_i32 v13, v0, 16, 4 -; GFX7-NEXT: v_bfe_i32 v16, v0, 4, 4 -; GFX7-NEXT: v_or_b32_e32 v3, v3, v9 -; GFX7-NEXT: v_and_b32_e32 v9, s4, v13 -; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v16 -; GFX7-NEXT: buffer_load_ubyte v16, off, s[0:3], 0 -; GFX7-NEXT: v_bfe_i32 v4, v2, 20, 4 -; GFX7-NEXT: v_bfe_i32 v5, v2, 16, 4 -; GFX7-NEXT: v_bfe_i32 v6, v2, 12, 4 -; GFX7-NEXT: v_bfe_i32 v7, v2, 8, 4 -; GFX7-NEXT: v_bfe_i32 v8, v2, 4, 4 +; GFX7-NEXT: s_waitcnt vmcnt(2) +; GFX7-NEXT: v_ashrrev_i32_e32 v3, 28, v2 +; GFX7-NEXT: v_bfe_i32 v4, v2, 24, 4 +; GFX7-NEXT: v_bfe_i32 v5, v2, 20, 4 +; GFX7-NEXT: v_bfe_i32 v6, v2, 16, 4 +; GFX7-NEXT: v_bfe_i32 v7, v2, 12, 4 +; GFX7-NEXT: v_bfe_i32 v8, v2, 8, 4 +; GFX7-NEXT: v_bfe_i32 v9, v2, 4, 4 ; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GFX7-NEXT: v_and_b32_e32 v5, s4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; GFX7-NEXT: v_and_b32_e32 v7, s4, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v3 +; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_and_b32_e32 v6, s4, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX7-NEXT: v_and_b32_e32 v8, s4, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 -; GFX7-NEXT: v_ashrrev_i32_e32 v10, 28, v0 -; GFX7-NEXT: v_bfe_i32 v11, v0, 24, 4 -; GFX7-NEXT: v_bfe_i32 v12, v0, 20, 4 -; GFX7-NEXT: v_bfe_i32 v14, v0, 12, 4 -; GFX7-NEXT: v_bfe_i32 v15, v0, 8, 4 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_ashrrev_i32_e32 v11, 28, v0 +; GFX7-NEXT: v_bfe_i32 v12, v0, 24, 4 +; GFX7-NEXT: v_bfe_i32 v13, v0, 20, 4 +; GFX7-NEXT: v_bfe_i32 v14, v0, 16, 4 +; GFX7-NEXT: v_bfe_i32 v15, v0, 12, 4 +; GFX7-NEXT: v_bfe_i32 v16, v0, 8, 4 +; GFX7-NEXT: v_bfe_i32 v17, v0, 4, 4 ; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 4 -; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v6 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v8 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v10 -; GFX7-NEXT: v_and_b32_e32 v7, s4, v11 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v12 -; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v14 -; GFX7-NEXT: v_and_b32_e32 v12, s4, v15 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v10 +; GFX7-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v7 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v11 +; GFX7-NEXT: v_and_b32_e32 v8, s4, v12 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v13 +; GFX7-NEXT: v_and_b32_e32 v10, s4, v14 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v15 +; GFX7-NEXT: v_and_b32_e32 v13, s4, v16 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v17 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v4, s5, v4 -; GFX7-NEXT: v_or_b32_e32 v6, v7, v6 -; GFX7-NEXT: v_or_b32_e32 v7, v9, v8 -; GFX7-NEXT: v_or_b32_e32 v8, v12, v11 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v13 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v5, s5, v5 +; GFX7-NEXT: v_or_b32_e32 v7, v8, v7 +; GFX7-NEXT: v_or_b32_e32 v8, v10, v9 +; GFX7-NEXT: v_or_b32_e32 v9, v13, v12 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v14 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX7-NEXT: v_and_b32_e32 v2, s5, v2 -; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX7-NEXT: v_and_b32_e32 v0, s5, v0 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 -; GFX7-NEXT: v_and_b32_e32 v6, s4, v2 -; GFX7-NEXT: v_and_b32_e32 v12, s4, v0 -; GFX7-NEXT: v_and_b32_e32 v5, s5, v7 -; GFX7-NEXT: v_bfe_u32 v7, v2, 8, 8 -; GFX7-NEXT: v_bfe_u32 v13, v0, 8, 8 ; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; GFX7-NEXT: v_and_b32_e32 v0, s5, v0 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v6 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX7-NEXT: v_and_b32_e32 v7, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v13, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v6, s5, v8 +; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 8 +; GFX7-NEXT: v_bfe_u32 v14, v0, 8, 8 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mad_u32_u24 v1, v7, v13, v1 +; GFX7-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v2 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v11, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v12, 24, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v8, s4, v3 -; GFX7-NEXT: v_and_b32_e32 v14, s4, v4 -; GFX7-NEXT: v_bfe_u32 v9, v3, 8, 8 -; GFX7-NEXT: v_bfe_u32 v15, v4, 8, 8 -; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 -; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX7-NEXT: v_and_b32_e32 v10, s4, v10 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v6, v6, v12, v16 -; GFX7-NEXT: v_mad_u32_u24 v6, v7, v13, v6 -; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v6 -; GFX7-NEXT: v_mad_u32_u24 v0, v5, v11, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v8, v14, v0 +; GFX7-NEXT: v_mad_u32_u24 v1, v8, v14, v1 +; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v9, s4, v4 +; GFX7-NEXT: v_and_b32_e32 v15, s4, v5 +; GFX7-NEXT: v_mad_u32_u24 v0, v6, v12, v0 +; GFX7-NEXT: v_bfe_u32 v10, v4, 8, 8 +; GFX7-NEXT: v_bfe_u32 v16, v5, 8, 8 ; GFX7-NEXT: v_mad_u32_u24 v0, v9, v15, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v3, v4, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v1, v10, v0 +; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8 +; GFX7-NEXT: v_bfe_u32 v5, v5, 16, 8 +; GFX7-NEXT: v_mad_u32_u24 v0, v10, v16, v0 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_and_b32_e32 v11, s4, v11 +; GFX7-NEXT: v_mad_u32_u24 v0, v4, v5, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v3, v11, v0 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -70,43 +70,43 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s6, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_mov_b32 s10, -1 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000 ; GFX8-NEXT: s_add_u32 s8, s8, s3 ; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 28, v0 -; GFX8-NEXT: v_bfe_u32 v3, v0, 24, 4 -; GFX8-NEXT: v_bfe_u32 v4, v0, 20, 4 -; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 4 -; GFX8-NEXT: v_bfe_u32 v6, v0, 12, 4 -; GFX8-NEXT: v_bfe_u32 v7, v0, 8, 4 -; GFX8-NEXT: v_bfe_u32 v8, v0, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v3 +; GFX8-NEXT: v_bfe_u32 v2, v3, 24, 4 +; GFX8-NEXT: v_bfe_u32 v4, v3, 20, 4 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 4 +; GFX8-NEXT: v_bfe_u32 v6, v3, 12, 4 +; GFX8-NEXT: v_bfe_u32 v7, v3, 8, 4 +; GFX8-NEXT: v_bfe_u32 v8, v3, 4, 4 +; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 28, v1 -; GFX8-NEXT: v_bfe_u32 v10, v1, 24, 4 -; GFX8-NEXT: v_bfe_u32 v11, v1, 20, 4 -; GFX8-NEXT: v_bfe_u32 v12, v1, 16, 4 -; GFX8-NEXT: v_bfe_u32 v13, v1, 12, 4 -; GFX8-NEXT: v_bfe_u32 v14, v1, 8, 4 -; GFX8-NEXT: v_bfe_u32 v15, v1, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 28, v0 +; GFX8-NEXT: v_bfe_u32 v10, v0, 24, 4 +; GFX8-NEXT: v_bfe_u32 v11, v0, 20, 4 +; GFX8-NEXT: v_bfe_u32 v12, v0, 16, 4 +; GFX8-NEXT: v_bfe_u32 v13, v0, 12, 4 +; GFX8-NEXT: v_bfe_u32 v14, v0, 8, 4 +; GFX8-NEXT: v_bfe_u32 v15, v0, 4, 4 +; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v0, v0, v1, s2 +; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, s2 ; GFX8-NEXT: v_mad_u32_u24 v0, v8, v15, v0 ; GFX8-NEXT: v_mad_u32_u24 v0, v7, v14, v0 ; GFX8-NEXT: v_mad_u32_u24 v0, v6, v13, v0 ; GFX8-NEXT: v_mad_u32_u24 v0, v5, v12, v0 ; GFX8-NEXT: v_mad_u32_u24 v0, v4, v11, v0 -; GFX8-NEXT: v_mad_u32_u24 v0, v3, v10, v0 -; GFX8-NEXT: v_mad_u32_u24 v2, v2, v9, v0 +; GFX8-NEXT: v_mad_u32_u24 v0, v2, v10, v0 +; GFX8-NEXT: v_mad_u32_u24 v2, v1, v9, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -172,15 +172,15 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] +; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot8_u32_u4 v0, v2, v3, s0 -; GFX9-DL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc32: @@ -296,37 +296,37 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: buffer_load_ushort v16, off, s[0:3], 0 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 28, v2 -; GFX7-NEXT: v_bfe_u32 v3, v2, 24, 4 -; GFX7-NEXT: v_bfe_u32 v4, v2, 20, 4 -; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 4 -; GFX7-NEXT: v_bfe_u32 v6, v2, 12, 4 -; GFX7-NEXT: v_bfe_u32 v7, v2, 8, 4 -; GFX7-NEXT: v_bfe_u32 v8, v2, 4, 4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2 +; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4 +; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4 +; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4 +; GFX7-NEXT: v_bfe_u32 v7, v2, 12, 4 +; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4 +; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4 ; GFX7-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_lshrrev_b32_e32 v9, 28, v0 -; GFX7-NEXT: v_bfe_u32 v10, v0, 24, 4 -; GFX7-NEXT: v_bfe_u32 v11, v0, 20, 4 -; GFX7-NEXT: v_bfe_u32 v12, v0, 16, 4 -; GFX7-NEXT: v_bfe_u32 v13, v0, 12, 4 -; GFX7-NEXT: v_bfe_u32 v14, v0, 8, 4 -; GFX7-NEXT: v_bfe_u32 v15, v0, 4, 4 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0 +; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4 +; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4 +; GFX7-NEXT: v_bfe_u32 v13, v0, 16, 4 +; GFX7-NEXT: v_bfe_u32 v14, v0, 12, 4 +; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4 +; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v16 +; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 +; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v1, v9, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -341,46 +341,46 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: flat_load_ushort v18, v[2:3] +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ushort v4, v[0:1] ; GFX8-NEXT: s_mov_b32 s10, -1 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000 ; GFX8-NEXT: s_add_u32 s8, s8, s3 ; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v4 -; GFX8-NEXT: v_bfe_u32 v5, v4, 24, 4 -; GFX8-NEXT: v_bfe_u32 v6, v4, 20, 4 -; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 4 -; GFX8-NEXT: v_bfe_u32 v8, v4, 12, 4 -; GFX8-NEXT: v_bfe_u32 v9, v4, 8, 4 -; GFX8-NEXT: v_bfe_u32 v10, v4, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 +; GFX8-NEXT: v_bfe_u32 v6, v3, 24, 4 +; GFX8-NEXT: v_bfe_u32 v7, v3, 20, 4 +; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 4 +; GFX8-NEXT: v_bfe_u32 v9, v3, 12, 4 +; GFX8-NEXT: v_bfe_u32 v10, v3, 8, 4 +; GFX8-NEXT: v_bfe_u32 v11, v3, 4, 4 +; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 28, v0 -; GFX8-NEXT: v_bfe_u32 v12, v0, 24, 4 -; GFX8-NEXT: v_bfe_u32 v13, v0, 20, 4 -; GFX8-NEXT: v_bfe_u32 v14, v0, 16, 4 -; GFX8-NEXT: v_bfe_u32 v15, v0, 12, 4 -; GFX8-NEXT: v_bfe_u32 v16, v0, 8, 4 -; GFX8-NEXT: v_bfe_u32 v17, v0, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v2 +; GFX8-NEXT: v_bfe_u32 v13, v2, 24, 4 +; GFX8-NEXT: v_bfe_u32 v14, v2, 20, 4 +; GFX8-NEXT: v_bfe_u32 v15, v2, 16, 4 +; GFX8-NEXT: v_bfe_u32 v16, v2, 12, 4 +; GFX8-NEXT: v_bfe_u32 v17, v2, 8, 4 +; GFX8-NEXT: v_bfe_u32 v18, v2, 4, 4 +; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u16 v0, v4, v0, v18 -; GFX8-NEXT: v_mad_u16 v0, v10, v17, v0 -; GFX8-NEXT: v_mad_u16 v0, v9, v16, v0 -; GFX8-NEXT: v_mad_u16 v0, v8, v15, v0 -; GFX8-NEXT: v_mad_u16 v0, v7, v14, v0 -; GFX8-NEXT: v_mad_u16 v0, v6, v13, v0 -; GFX8-NEXT: v_mad_u16 v0, v5, v12, v0 -; GFX8-NEXT: v_mad_u16 v0, v1, v11, v0 -; GFX8-NEXT: flat_store_short v[2:3], v0 +; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 +; GFX8-NEXT: v_mad_u16 v2, v11, v18, v2 +; GFX8-NEXT: v_mad_u16 v2, v10, v17, v2 +; GFX8-NEXT: v_mad_u16 v2, v9, v16, v2 +; GFX8-NEXT: v_mad_u16 v2, v8, v15, v2 +; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2 +; GFX8-NEXT: v_mad_u16 v2, v6, v13, v2 +; GFX8-NEXT: v_mad_u16 v2, v5, v12, v2 +; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: udot8_acc16: @@ -393,40 +393,40 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v17, v1, s[2:3] ; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 28, v2 -; GFX9-NEXT: v_bfe_u32 v4, v2, 24, 4 -; GFX9-NEXT: v_bfe_u32 v5, v2, 20, 4 -; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 4 -; GFX9-NEXT: v_bfe_u32 v7, v2, 12, 4 -; GFX9-NEXT: v_bfe_u32 v8, v2, 8, 4 -; GFX9-NEXT: v_bfe_u32 v9, v2, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1 +; GFX9-NEXT: v_bfe_u32 v5, v1, 24, 4 +; GFX9-NEXT: v_bfe_u32 v6, v1, 20, 4 +; GFX9-NEXT: v_bfe_u32 v7, v1, 16, 4 +; GFX9-NEXT: v_bfe_u32 v8, v1, 12, 4 +; GFX9-NEXT: v_bfe_u32 v9, v1, 8, 4 +; GFX9-NEXT: v_bfe_u32 v10, v1, 4, 4 +; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v3 -; GFX9-NEXT: v_bfe_u32 v11, v3, 24, 4 -; GFX9-NEXT: v_bfe_u32 v12, v3, 20, 4 -; GFX9-NEXT: v_bfe_u32 v13, v3, 16, 4 -; GFX9-NEXT: v_bfe_u32 v14, v3, 12, 4 -; GFX9-NEXT: v_bfe_u32 v15, v3, 8, 4 -; GFX9-NEXT: v_bfe_u32 v16, v3, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2 +; GFX9-NEXT: v_bfe_u32 v12, v2, 24, 4 +; GFX9-NEXT: v_bfe_u32 v13, v2, 20, 4 +; GFX9-NEXT: v_bfe_u32 v14, v2, 16, 4 +; GFX9-NEXT: v_bfe_u32 v15, v2, 12, 4 +; GFX9-NEXT: v_bfe_u32 v16, v2, 8, 4 +; GFX9-NEXT: v_bfe_u32 v17, v2, 4, 4 +; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_legacy_u16 v2, v2, v3, v17 -; GFX9-NEXT: v_mad_legacy_u16 v2, v9, v16, v2 -; GFX9-NEXT: v_mad_legacy_u16 v2, v8, v15, v2 -; GFX9-NEXT: v_mad_legacy_u16 v2, v7, v14, v2 -; GFX9-NEXT: v_mad_legacy_u16 v2, v6, v13, v2 -; GFX9-NEXT: v_mad_legacy_u16 v2, v5, v12, v2 -; GFX9-NEXT: v_mad_legacy_u16 v2, v4, v11, v2 -; GFX9-NEXT: v_mad_legacy_u16 v0, v0, v10, v2 -; GFX9-NEXT: global_store_short v1, v0, s[2:3] +; GFX9-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 +; GFX9-NEXT: v_mad_legacy_u16 v1, v10, v17, v1 +; GFX9-NEXT: v_mad_legacy_u16 v1, v9, v16, v1 +; GFX9-NEXT: v_mad_legacy_u16 v1, v8, v15, v1 +; GFX9-NEXT: v_mad_legacy_u16 v1, v7, v14, v1 +; GFX9-NEXT: v_mad_legacy_u16 v1, v6, v13, v1 +; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 +; GFX9-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc16: @@ -439,40 +439,40 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-DL-NEXT: global_load_ushort v17, v1, s[2:3] ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v0, 28, v2 -; GFX9-DL-NEXT: v_bfe_u32 v4, v2, 24, 4 -; GFX9-DL-NEXT: v_bfe_u32 v5, v2, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v6, v2, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v7, v2, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v8, v2, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v9, v2, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1 +; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 24, 4 +; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 20, 4 +; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 16, 4 +; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 12, 4 +; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 8, 4 +; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 4, 4 +; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v3 -; GFX9-DL-NEXT: v_bfe_u32 v11, v3, 24, 4 -; GFX9-DL-NEXT: v_bfe_u32 v12, v3, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v13, v3, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v14, v3, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v15, v3, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v16, v3, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2 +; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 24, 4 +; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 20, 4 +; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 16, 4 +; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 12, 4 +; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 8, 4 +; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 4, 4 +; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v2, v3, v17 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v9, v16, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v8, v15, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v7, v14, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v6, v13, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v5, v12, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v4, v11, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v0, v10, v2 -; GFX9-DL-NEXT: global_store_short v1, v0, s[2:3] +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v10, v17, v1 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v9, v16, v1 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v8, v15, v1 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v7, v14, v1 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v6, v13, v1 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 +; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc16: @@ -614,37 +614,37 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: buffer_load_ubyte v16, off, s[0:3], 0 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 28, v2 -; GFX7-NEXT: v_bfe_u32 v3, v2, 24, 4 -; GFX7-NEXT: v_bfe_u32 v4, v2, 20, 4 -; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 4 -; GFX7-NEXT: v_bfe_u32 v6, v2, 12, 4 -; GFX7-NEXT: v_bfe_u32 v7, v2, 8, 4 -; GFX7-NEXT: v_bfe_u32 v8, v2, 4, 4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2 +; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4 +; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4 +; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4 +; GFX7-NEXT: v_bfe_u32 v7, v2, 12, 4 +; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4 +; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4 ; GFX7-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_lshrrev_b32_e32 v9, 28, v0 -; GFX7-NEXT: v_bfe_u32 v10, v0, 24, 4 -; GFX7-NEXT: v_bfe_u32 v11, v0, 20, 4 -; GFX7-NEXT: v_bfe_u32 v12, v0, 16, 4 -; GFX7-NEXT: v_bfe_u32 v13, v0, 12, 4 -; GFX7-NEXT: v_bfe_u32 v14, v0, 8, 4 -; GFX7-NEXT: v_bfe_u32 v15, v0, 4, 4 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0 +; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4 +; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4 +; GFX7-NEXT: v_bfe_u32 v13, v0, 16, 4 +; GFX7-NEXT: v_bfe_u32 v14, v0, 12, 4 +; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4 +; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v16 +; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 +; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v1, v9, v0 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -659,46 +659,46 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: flat_load_ubyte v18, v[2:3] +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ubyte v4, v[0:1] ; GFX8-NEXT: s_mov_b32 s10, -1 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000 ; GFX8-NEXT: s_add_u32 s8, s8, s3 ; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v4 -; GFX8-NEXT: v_bfe_u32 v5, v4, 24, 4 -; GFX8-NEXT: v_bfe_u32 v6, v4, 20, 4 -; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 4 -; GFX8-NEXT: v_bfe_u32 v8, v4, 12, 4 -; GFX8-NEXT: v_bfe_u32 v9, v4, 8, 4 -; GFX8-NEXT: v_bfe_u32 v10, v4, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 +; GFX8-NEXT: v_bfe_u32 v6, v3, 24, 4 +; GFX8-NEXT: v_bfe_u32 v7, v3, 20, 4 +; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 4 +; GFX8-NEXT: v_bfe_u32 v9, v3, 12, 4 +; GFX8-NEXT: v_bfe_u32 v10, v3, 8, 4 +; GFX8-NEXT: v_bfe_u32 v11, v3, 4, 4 +; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 28, v0 -; GFX8-NEXT: v_bfe_u32 v12, v0, 24, 4 -; GFX8-NEXT: v_bfe_u32 v13, v0, 20, 4 -; GFX8-NEXT: v_bfe_u32 v14, v0, 16, 4 -; GFX8-NEXT: v_bfe_u32 v15, v0, 12, 4 -; GFX8-NEXT: v_bfe_u32 v16, v0, 8, 4 -; GFX8-NEXT: v_bfe_u32 v17, v0, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v2 +; GFX8-NEXT: v_bfe_u32 v13, v2, 24, 4 +; GFX8-NEXT: v_bfe_u32 v14, v2, 20, 4 +; GFX8-NEXT: v_bfe_u32 v15, v2, 16, 4 +; GFX8-NEXT: v_bfe_u32 v16, v2, 12, 4 +; GFX8-NEXT: v_bfe_u32 v17, v2, 8, 4 +; GFX8-NEXT: v_bfe_u32 v18, v2, 4, 4 +; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u16 v0, v4, v0, v18 -; GFX8-NEXT: v_mad_u16 v0, v10, v17, v0 -; GFX8-NEXT: v_mad_u16 v0, v9, v16, v0 -; GFX8-NEXT: v_mad_u16 v0, v8, v15, v0 -; GFX8-NEXT: v_mad_u16 v0, v7, v14, v0 -; GFX8-NEXT: v_mad_u16 v0, v6, v13, v0 -; GFX8-NEXT: v_mad_u16 v0, v5, v12, v0 -; GFX8-NEXT: v_mad_u16 v0, v1, v11, v0 -; GFX8-NEXT: flat_store_byte v[2:3], v0 +; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 +; GFX8-NEXT: v_mad_u16 v2, v11, v18, v2 +; GFX8-NEXT: v_mad_u16 v2, v10, v17, v2 +; GFX8-NEXT: v_mad_u16 v2, v9, v16, v2 +; GFX8-NEXT: v_mad_u16 v2, v8, v15, v2 +; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2 +; GFX8-NEXT: v_mad_u16 v2, v6, v13, v2 +; GFX8-NEXT: v_mad_u16 v2, v5, v12, v2 +; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: udot8_acc8: @@ -711,40 +711,40 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-NEXT: global_load_ubyte v17, v1, s[2:3] ; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 28, v2 -; GFX9-NEXT: v_bfe_u32 v4, v2, 24, 4 -; GFX9-NEXT: v_bfe_u32 v5, v2, 20, 4 -; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 4 -; GFX9-NEXT: v_bfe_u32 v7, v2, 12, 4 -; GFX9-NEXT: v_bfe_u32 v8, v2, 8, 4 -; GFX9-NEXT: v_bfe_u32 v9, v2, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1 +; GFX9-NEXT: v_bfe_u32 v5, v1, 24, 4 +; GFX9-NEXT: v_bfe_u32 v6, v1, 20, 4 +; GFX9-NEXT: v_bfe_u32 v7, v1, 16, 4 +; GFX9-NEXT: v_bfe_u32 v8, v1, 12, 4 +; GFX9-NEXT: v_bfe_u32 v9, v1, 8, 4 +; GFX9-NEXT: v_bfe_u32 v10, v1, 4, 4 +; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v3 -; GFX9-NEXT: v_bfe_u32 v11, v3, 24, 4 -; GFX9-NEXT: v_bfe_u32 v12, v3, 20, 4 -; GFX9-NEXT: v_bfe_u32 v13, v3, 16, 4 -; GFX9-NEXT: v_bfe_u32 v14, v3, 12, 4 -; GFX9-NEXT: v_bfe_u32 v15, v3, 8, 4 -; GFX9-NEXT: v_bfe_u32 v16, v3, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2 +; GFX9-NEXT: v_bfe_u32 v12, v2, 24, 4 +; GFX9-NEXT: v_bfe_u32 v13, v2, 20, 4 +; GFX9-NEXT: v_bfe_u32 v14, v2, 16, 4 +; GFX9-NEXT: v_bfe_u32 v15, v2, 12, 4 +; GFX9-NEXT: v_bfe_u32 v16, v2, 8, 4 +; GFX9-NEXT: v_bfe_u32 v17, v2, 4, 4 +; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_legacy_u16 v2, v2, v3, v17 -; GFX9-NEXT: v_mad_legacy_u16 v2, v9, v16, v2 -; GFX9-NEXT: v_mad_legacy_u16 v2, v8, v15, v2 -; GFX9-NEXT: v_mad_legacy_u16 v2, v7, v14, v2 -; GFX9-NEXT: v_mad_legacy_u16 v2, v6, v13, v2 -; GFX9-NEXT: v_mad_legacy_u16 v2, v5, v12, v2 -; GFX9-NEXT: v_mad_legacy_u16 v2, v4, v11, v2 -; GFX9-NEXT: v_mad_legacy_u16 v0, v0, v10, v2 -; GFX9-NEXT: global_store_byte v1, v0, s[2:3] +; GFX9-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 +; GFX9-NEXT: v_mad_legacy_u16 v1, v10, v17, v1 +; GFX9-NEXT: v_mad_legacy_u16 v1, v9, v16, v1 +; GFX9-NEXT: v_mad_legacy_u16 v1, v8, v15, v1 +; GFX9-NEXT: v_mad_legacy_u16 v1, v7, v14, v1 +; GFX9-NEXT: v_mad_legacy_u16 v1, v6, v13, v1 +; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 +; GFX9-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 +; GFX9-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc8: @@ -757,40 +757,40 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-DL-NEXT: global_load_ubyte v17, v1, s[2:3] ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v0, 28, v2 -; GFX9-DL-NEXT: v_bfe_u32 v4, v2, 24, 4 -; GFX9-DL-NEXT: v_bfe_u32 v5, v2, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v6, v2, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v7, v2, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v8, v2, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v9, v2, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1 +; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 24, 4 +; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 20, 4 +; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 16, 4 +; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 12, 4 +; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 8, 4 +; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 4, 4 +; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v3 -; GFX9-DL-NEXT: v_bfe_u32 v11, v3, 24, 4 -; GFX9-DL-NEXT: v_bfe_u32 v12, v3, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v13, v3, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v14, v3, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v15, v3, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v16, v3, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2 +; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 24, 4 +; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 20, 4 +; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 16, 4 +; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 12, 4 +; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 8, 4 +; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 4, 4 +; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v2, v3, v17 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v9, v16, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v8, v15, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v7, v14, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v6, v13, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v5, v12, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v4, v11, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v0, v10, v2 -; GFX9-DL-NEXT: global_store_byte v1, v0, s[2:3] +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v10, v17, v1 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v9, v16, v1 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v8, v15, v1 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v7, v14, v1 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v6, v13, v1 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc8: @@ -932,37 +932,37 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: buffer_load_ubyte v16, off, s[0:3], 0 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 28, v2 -; GFX7-NEXT: v_bfe_u32 v3, v2, 24, 4 -; GFX7-NEXT: v_bfe_u32 v4, v2, 20, 4 -; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 4 -; GFX7-NEXT: v_bfe_u32 v6, v2, 12, 4 -; GFX7-NEXT: v_bfe_u32 v7, v2, 8, 4 -; GFX7-NEXT: v_bfe_u32 v8, v2, 4, 4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2 +; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4 +; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4 +; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4 +; GFX7-NEXT: v_bfe_u32 v7, v2, 12, 4 +; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4 +; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4 ; GFX7-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_lshrrev_b32_e32 v9, 28, v0 -; GFX7-NEXT: v_bfe_u32 v10, v0, 24, 4 -; GFX7-NEXT: v_bfe_u32 v11, v0, 20, 4 -; GFX7-NEXT: v_bfe_u32 v12, v0, 16, 4 -; GFX7-NEXT: v_bfe_u32 v13, v0, 12, 4 -; GFX7-NEXT: v_bfe_u32 v14, v0, 8, 4 -; GFX7-NEXT: v_bfe_u32 v15, v0, 4, 4 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0 +; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4 +; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4 +; GFX7-NEXT: v_bfe_u32 v13, v0, 16, 4 +; GFX7-NEXT: v_bfe_u32 v14, v0, 12, 4 +; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4 +; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v16 +; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 +; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v1, v9, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -978,47 +978,47 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: flat_load_ubyte v18, v[2:3] +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ubyte v4, v[0:1] ; GFX8-NEXT: s_mov_b32 s10, -1 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000 ; GFX8-NEXT: s_add_u32 s8, s8, s3 ; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v4 -; GFX8-NEXT: v_bfe_u32 v6, v4, 20, 4 -; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 4 -; GFX8-NEXT: v_bfe_u32 v8, v4, 12, 4 -; GFX8-NEXT: v_bfe_u32 v9, v4, 8, 4 -; GFX8-NEXT: v_bfe_u32 v10, v4, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3 +; GFX8-NEXT: v_bfe_u32 v7, v3, 20, 4 +; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 4 +; GFX8-NEXT: v_bfe_u32 v9, v3, 12, 4 +; GFX8-NEXT: v_bfe_u32 v10, v3, 8, 4 +; GFX8-NEXT: v_bfe_u32 v11, v3, 4, 4 +; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 28, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v0 -; GFX8-NEXT: v_bfe_u32 v13, v0, 20, 4 -; GFX8-NEXT: v_bfe_u32 v14, v0, 16, 4 -; GFX8-NEXT: v_bfe_u32 v15, v0, 12, 4 -; GFX8-NEXT: v_bfe_u32 v16, v0, 8, 4 -; GFX8-NEXT: v_bfe_u32 v17, v0, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v2 +; GFX8-NEXT: v_bfe_u32 v14, v2, 20, 4 +; GFX8-NEXT: v_bfe_u32 v15, v2, 16, 4 +; GFX8-NEXT: v_bfe_u32 v16, v2, 12, 4 +; GFX8-NEXT: v_bfe_u32 v17, v2, 8, 4 +; GFX8-NEXT: v_bfe_u32 v18, v2, 4, 4 +; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u16 v0, v4, v0, v18 -; GFX8-NEXT: v_mad_u16 v0, v10, v17, v0 -; GFX8-NEXT: v_mad_u16 v0, v9, v16, v0 -; GFX8-NEXT: v_mad_u16 v0, v8, v15, v0 -; GFX8-NEXT: v_mad_u16 v0, v7, v14, v0 -; GFX8-NEXT: v_mad_u16 v0, v6, v13, v0 -; GFX8-NEXT: v_mad_u16 v0, v5, v12, v0 -; GFX8-NEXT: v_mad_u16 v0, v1, v11, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX8-NEXT: flat_store_byte v[2:3], v0 +; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 +; GFX8-NEXT: v_mad_u16 v2, v11, v18, v2 +; GFX8-NEXT: v_mad_u16 v2, v10, v17, v2 +; GFX8-NEXT: v_mad_u16 v2, v9, v16, v2 +; GFX8-NEXT: v_mad_u16 v2, v8, v15, v2 +; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2 +; GFX8-NEXT: v_mad_u16 v2, v6, v13, v2 +; GFX8-NEXT: v_mad_u16 v2, v5, v12, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: udot8_acc4: @@ -1031,41 +1031,41 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-NEXT: global_load_ubyte v17, v1, s[2:3] ; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 28, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v2 -; GFX9-NEXT: v_bfe_u32 v5, v2, 20, 4 -; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 4 -; GFX9-NEXT: v_bfe_u32 v7, v2, 12, 4 -; GFX9-NEXT: v_bfe_u32 v8, v2, 8, 4 -; GFX9-NEXT: v_bfe_u32 v9, v2, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX9-NEXT: v_bfe_u32 v6, v1, 20, 4 +; GFX9-NEXT: v_bfe_u32 v7, v1, 16, 4 +; GFX9-NEXT: v_bfe_u32 v8, v1, 12, 4 +; GFX9-NEXT: v_bfe_u32 v9, v1, 8, 4 +; GFX9-NEXT: v_bfe_u32 v10, v1, 4, 4 +; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v3 -; GFX9-NEXT: v_bfe_u32 v12, v3, 20, 4 -; GFX9-NEXT: v_bfe_u32 v13, v3, 16, 4 -; GFX9-NEXT: v_bfe_u32 v14, v3, 12, 4 -; GFX9-NEXT: v_bfe_u32 v15, v3, 8, 4 -; GFX9-NEXT: v_bfe_u32 v16, v3, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v2 +; GFX9-NEXT: v_bfe_u32 v13, v2, 20, 4 +; GFX9-NEXT: v_bfe_u32 v14, v2, 16, 4 +; GFX9-NEXT: v_bfe_u32 v15, v2, 12, 4 +; GFX9-NEXT: v_bfe_u32 v16, v2, 8, 4 +; GFX9-NEXT: v_bfe_u32 v17, v2, 4, 4 +; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_legacy_u16 v2, v2, v3, v17 -; GFX9-NEXT: v_mad_legacy_u16 v2, v9, v16, v2 -; GFX9-NEXT: v_mad_legacy_u16 v2, v8, v15, v2 -; GFX9-NEXT: v_mad_legacy_u16 v2, v7, v14, v2 -; GFX9-NEXT: v_mad_legacy_u16 v2, v6, v13, v2 -; GFX9-NEXT: v_mad_legacy_u16 v2, v5, v12, v2 -; GFX9-NEXT: v_mad_legacy_u16 v2, v4, v11, v2 -; GFX9-NEXT: v_mad_legacy_u16 v0, v0, v10, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX9-NEXT: global_store_byte v1, v0, s[2:3] +; GFX9-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 +; GFX9-NEXT: v_mad_legacy_u16 v1, v10, v17, v1 +; GFX9-NEXT: v_mad_legacy_u16 v1, v9, v16, v1 +; GFX9-NEXT: v_mad_legacy_u16 v1, v8, v15, v1 +; GFX9-NEXT: v_mad_legacy_u16 v1, v7, v14, v1 +; GFX9-NEXT: v_mad_legacy_u16 v1, v6, v13, v1 +; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 +; GFX9-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX9-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc4: @@ -1078,41 +1078,41 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-DL-NEXT: global_load_ubyte v17, v1, s[2:3] ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v0, 28, v2 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v2 -; GFX9-DL-NEXT: v_bfe_u32 v5, v2, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v6, v2, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v7, v2, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v8, v2, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v9, v2, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 20, 4 +; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 16, 4 +; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 12, 4 +; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 8, 4 +; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 4, 4 +; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v3 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 24, v3 -; GFX9-DL-NEXT: v_bfe_u32 v12, v3, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v13, v3, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v14, v3, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v15, v3, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v16, v3, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 24, v2 +; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 20, 4 +; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 16, 4 +; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 12, 4 +; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 8, 4 +; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 4, 4 +; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v2, v3, v17 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v9, v16, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v8, v15, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v7, v14, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v6, v13, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v5, v12, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v4, v11, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v0, v10, v2 -; GFX9-DL-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX9-DL-NEXT: global_store_byte v1, v0, s[2:3] +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v10, v17, v1 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v9, v16, v1 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v8, v15, v1 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v7, v14, v1 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v6, v13, v1 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 +; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc4: @@ -1239,37 +1239,37 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: buffer_load_ubyte v16, off, s[0:3], 0 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 28, v2 -; GFX7-NEXT: v_bfe_u32 v3, v2, 24, 4 -; GFX7-NEXT: v_bfe_u32 v4, v2, 20, 4 -; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 4 -; GFX7-NEXT: v_bfe_u32 v6, v2, 12, 4 -; GFX7-NEXT: v_bfe_u32 v7, v2, 8, 4 -; GFX7-NEXT: v_bfe_u32 v8, v2, 4, 4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2 +; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4 +; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4 +; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4 +; GFX7-NEXT: v_bfe_u32 v7, v2, 12, 4 +; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4 +; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4 ; GFX7-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_lshrrev_b32_e32 v9, 28, v0 -; GFX7-NEXT: v_bfe_u32 v10, v0, 24, 4 -; GFX7-NEXT: v_bfe_u32 v11, v0, 20, 4 -; GFX7-NEXT: v_bfe_u32 v12, v0, 16, 4 -; GFX7-NEXT: v_bfe_u32 v13, v0, 12, 4 -; GFX7-NEXT: v_bfe_u32 v14, v0, 8, 4 -; GFX7-NEXT: v_bfe_u32 v15, v0, 4, 4 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0 +; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4 +; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4 +; GFX7-NEXT: v_bfe_u32 v13, v0, 16, 4 +; GFX7-NEXT: v_bfe_u32 v14, v0, 12, 4 +; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4 +; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v16 +; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 +; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v1, v9, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -1285,47 +1285,47 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: flat_load_ubyte v18, v[2:3] +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ubyte v4, v[0:1] ; GFX8-NEXT: s_mov_b32 s10, -1 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000 ; GFX8-NEXT: s_add_u32 s8, s8, s3 ; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v4 -; GFX8-NEXT: v_bfe_u32 v6, v4, 20, 4 -; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 4 -; GFX8-NEXT: v_bfe_u32 v8, v4, 12, 4 -; GFX8-NEXT: v_bfe_u32 v9, v4, 8, 4 -; GFX8-NEXT: v_bfe_u32 v10, v4, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3 +; GFX8-NEXT: v_bfe_u32 v7, v3, 20, 4 +; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 4 +; GFX8-NEXT: v_bfe_u32 v9, v3, 12, 4 +; GFX8-NEXT: v_bfe_u32 v10, v3, 8, 4 +; GFX8-NEXT: v_bfe_u32 v11, v3, 4, 4 +; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 28, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v0 -; GFX8-NEXT: v_bfe_u32 v13, v0, 20, 4 -; GFX8-NEXT: v_bfe_u32 v14, v0, 16, 4 -; GFX8-NEXT: v_bfe_u32 v15, v0, 12, 4 -; GFX8-NEXT: v_bfe_u32 v16, v0, 8, 4 -; GFX8-NEXT: v_bfe_u32 v17, v0, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v2 +; GFX8-NEXT: v_bfe_u32 v14, v2, 20, 4 +; GFX8-NEXT: v_bfe_u32 v15, v2, 16, 4 +; GFX8-NEXT: v_bfe_u32 v16, v2, 12, 4 +; GFX8-NEXT: v_bfe_u32 v17, v2, 8, 4 +; GFX8-NEXT: v_bfe_u32 v18, v2, 4, 4 +; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u16 v0, v4, v0, v18 -; GFX8-NEXT: v_mad_u16 v0, v10, v17, v0 -; GFX8-NEXT: v_mad_u16 v0, v9, v16, v0 -; GFX8-NEXT: v_mad_u16 v0, v8, v15, v0 -; GFX8-NEXT: v_mad_u16 v0, v7, v14, v0 -; GFX8-NEXT: v_mad_u16 v0, v6, v13, v0 -; GFX8-NEXT: v_mad_u16 v0, v5, v12, v0 -; GFX8-NEXT: v_mad_u16 v0, v1, v11, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX8-NEXT: flat_store_byte v[2:3], v0 +; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 +; GFX8-NEXT: v_mad_u16 v2, v11, v18, v2 +; GFX8-NEXT: v_mad_u16 v2, v10, v17, v2 +; GFX8-NEXT: v_mad_u16 v2, v9, v16, v2 +; GFX8-NEXT: v_mad_u16 v2, v8, v15, v2 +; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2 +; GFX8-NEXT: v_mad_u16 v2, v6, v13, v2 +; GFX8-NEXT: v_mad_u16 v2, v5, v12, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: udot8_CommutationInsideMAD: @@ -1338,41 +1338,41 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-NEXT: global_load_ubyte v17, v1, s[2:3] ; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 28, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v2 -; GFX9-NEXT: v_bfe_u32 v5, v2, 20, 4 -; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 4 -; GFX9-NEXT: v_bfe_u32 v7, v2, 12, 4 -; GFX9-NEXT: v_bfe_u32 v8, v2, 8, 4 -; GFX9-NEXT: v_bfe_u32 v9, v2, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX9-NEXT: v_bfe_u32 v6, v1, 20, 4 +; GFX9-NEXT: v_bfe_u32 v7, v1, 16, 4 +; GFX9-NEXT: v_bfe_u32 v8, v1, 12, 4 +; GFX9-NEXT: v_bfe_u32 v9, v1, 8, 4 +; GFX9-NEXT: v_bfe_u32 v10, v1, 4, 4 +; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v3 -; GFX9-NEXT: v_bfe_u32 v12, v3, 20, 4 -; GFX9-NEXT: v_bfe_u32 v13, v3, 16, 4 -; GFX9-NEXT: v_bfe_u32 v14, v3, 12, 4 -; GFX9-NEXT: v_bfe_u32 v15, v3, 8, 4 -; GFX9-NEXT: v_bfe_u32 v16, v3, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v2 +; GFX9-NEXT: v_bfe_u32 v13, v2, 20, 4 +; GFX9-NEXT: v_bfe_u32 v14, v2, 16, 4 +; GFX9-NEXT: v_bfe_u32 v15, v2, 12, 4 +; GFX9-NEXT: v_bfe_u32 v16, v2, 8, 4 +; GFX9-NEXT: v_bfe_u32 v17, v2, 4, 4 +; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_legacy_u16 v2, v2, v3, v17 -; GFX9-NEXT: v_mad_legacy_u16 v2, v9, v16, v2 -; GFX9-NEXT: v_mad_legacy_u16 v2, v8, v15, v2 -; GFX9-NEXT: v_mad_legacy_u16 v2, v7, v14, v2 -; GFX9-NEXT: v_mad_legacy_u16 v2, v6, v13, v2 -; GFX9-NEXT: v_mad_legacy_u16 v2, v5, v12, v2 -; GFX9-NEXT: v_mad_legacy_u16 v2, v4, v11, v2 -; GFX9-NEXT: v_mad_legacy_u16 v0, v0, v10, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX9-NEXT: global_store_byte v1, v0, s[2:3] +; GFX9-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 +; GFX9-NEXT: v_mad_legacy_u16 v1, v10, v17, v1 +; GFX9-NEXT: v_mad_legacy_u16 v1, v9, v16, v1 +; GFX9-NEXT: v_mad_legacy_u16 v1, v8, v15, v1 +; GFX9-NEXT: v_mad_legacy_u16 v1, v7, v14, v1 +; GFX9-NEXT: v_mad_legacy_u16 v1, v6, v13, v1 +; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 +; GFX9-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX9-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_CommutationInsideMAD: @@ -1385,41 +1385,41 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-DL-NEXT: global_load_ubyte v17, v1, s[2:3] ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v0, 28, v2 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v2 -; GFX9-DL-NEXT: v_bfe_u32 v5, v2, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v6, v2, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v7, v2, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v8, v2, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v9, v2, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 20, 4 +; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 16, 4 +; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 12, 4 +; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 8, 4 +; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 4, 4 +; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v3 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 24, v3 -; GFX9-DL-NEXT: v_bfe_u32 v12, v3, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v13, v3, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v14, v3, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v15, v3, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v16, v3, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 24, v2 +; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 20, 4 +; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 16, 4 +; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 12, 4 +; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 8, 4 +; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 4, 4 +; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v2, v3, v17 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v9, v16, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v8, v15, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v7, v14, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v6, v13, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v5, v12, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v4, v11, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v0, v10, v2 -; GFX9-DL-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX9-DL-NEXT: global_store_byte v1, v0, s[2:3] +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v10, v17, v1 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v9, v16, v1 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v8, v15, v1 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v7, v14, v1 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v6, v13, v1 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 +; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_CommutationInsideMAD: @@ -1591,44 +1591,44 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s6, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_mov_b32 s10, -1 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000 ; GFX8-NEXT: s_add_u32 s8, s8, s3 ; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 28, v0 -; GFX8-NEXT: v_bfe_u32 v3, v0, 24, 4 -; GFX8-NEXT: v_bfe_u32 v4, v0, 20, 4 -; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 4 -; GFX8-NEXT: v_bfe_u32 v6, v0, 12, 4 -; GFX8-NEXT: v_bfe_u32 v7, v0, 8, 4 -; GFX8-NEXT: v_bfe_u32 v8, v0, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v3 +; GFX8-NEXT: v_bfe_u32 v2, v3, 24, 4 +; GFX8-NEXT: v_bfe_u32 v4, v3, 20, 4 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 4 +; GFX8-NEXT: v_bfe_u32 v6, v3, 12, 4 +; GFX8-NEXT: v_bfe_u32 v7, v3, 8, 4 +; GFX8-NEXT: v_bfe_u32 v8, v3, 4, 4 +; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 28, v1 -; GFX8-NEXT: v_bfe_u32 v10, v1, 24, 4 -; GFX8-NEXT: v_bfe_u32 v11, v1, 20, 4 -; GFX8-NEXT: v_bfe_u32 v12, v1, 16, 4 -; GFX8-NEXT: v_bfe_u32 v13, v1, 12, 4 -; GFX8-NEXT: v_bfe_u32 v14, v1, 8, 4 -; GFX8-NEXT: v_bfe_u32 v15, v1, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 28, v0 +; GFX8-NEXT: v_bfe_u32 v10, v0, 24, 4 +; GFX8-NEXT: v_bfe_u32 v11, v0, 20, 4 +; GFX8-NEXT: v_bfe_u32 v12, v0, 16, 4 +; GFX8-NEXT: v_bfe_u32 v13, v0, 12, 4 +; GFX8-NEXT: v_bfe_u32 v14, v0, 8, 4 +; GFX8-NEXT: v_bfe_u32 v15, v0, 4, 4 +; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v16, v0, v1, s2 -; GFX8-NEXT: v_mad_u32_u24 v0, v0, v1, v16 -; GFX8-NEXT: v_mad_u32_u24 v1, v8, v15, v16 -; GFX8-NEXT: v_mad_u32_u24 v1, v7, v14, v1 -; GFX8-NEXT: v_mad_u32_u24 v1, v6, v13, v1 -; GFX8-NEXT: v_mad_u32_u24 v1, v5, v12, v1 -; GFX8-NEXT: v_mad_u32_u24 v1, v4, v11, v1 -; GFX8-NEXT: v_mad_u32_u24 v1, v3, v10, v1 -; GFX8-NEXT: v_mad_u32_u24 v1, v2, v9, v1 +; GFX8-NEXT: v_mad_u32_u24 v16, v3, v0, s2 +; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, v16 +; GFX8-NEXT: v_mad_u32_u24 v3, v8, v15, v16 +; GFX8-NEXT: v_mad_u32_u24 v3, v7, v14, v3 +; GFX8-NEXT: v_mad_u32_u24 v3, v6, v13, v3 +; GFX8-NEXT: v_mad_u32_u24 v3, v5, v12, v3 +; GFX8-NEXT: v_mad_u32_u24 v3, v4, v11, v3 +; GFX8-NEXT: v_mad_u32_u24 v2, v2, v10, v3 +; GFX8-NEXT: v_mad_u32_u24 v1, v1, v9, v2 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -1925,43 +1925,43 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s6, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_mov_b32 s10, -1 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000 ; GFX8-NEXT: s_add_u32 s8, s8, s3 ; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 28, v0 -; GFX8-NEXT: v_bfe_u32 v3, v0, 24, 4 -; GFX8-NEXT: v_bfe_u32 v4, v0, 20, 4 -; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 4 -; GFX8-NEXT: v_bfe_u32 v6, v0, 12, 4 -; GFX8-NEXT: v_bfe_u32 v7, v0, 8, 4 -; GFX8-NEXT: v_bfe_u32 v8, v0, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v3 +; GFX8-NEXT: v_bfe_u32 v2, v3, 24, 4 +; GFX8-NEXT: v_bfe_u32 v4, v3, 20, 4 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 4 +; GFX8-NEXT: v_bfe_u32 v6, v3, 12, 4 +; GFX8-NEXT: v_bfe_u32 v7, v3, 8, 4 +; GFX8-NEXT: v_bfe_u32 v8, v3, 4, 4 +; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 28, v1 -; GFX8-NEXT: v_bfe_u32 v10, v1, 24, 4 -; GFX8-NEXT: v_bfe_u32 v11, v1, 20, 4 -; GFX8-NEXT: v_bfe_u32 v12, v1, 16, 4 -; GFX8-NEXT: v_bfe_u32 v13, v1, 12, 4 -; GFX8-NEXT: v_bfe_u32 v14, v1, 8, 4 -; GFX8-NEXT: v_bfe_u32 v15, v1, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 28, v0 +; GFX8-NEXT: v_bfe_u32 v10, v0, 24, 4 +; GFX8-NEXT: v_bfe_u32 v11, v0, 20, 4 +; GFX8-NEXT: v_bfe_u32 v12, v0, 16, 4 +; GFX8-NEXT: v_bfe_u32 v13, v0, 12, 4 +; GFX8-NEXT: v_bfe_u32 v14, v0, 8, 4 +; GFX8-NEXT: v_bfe_u32 v15, v0, 4, 4 +; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v0, v0, v1, s2 +; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, s2 ; GFX8-NEXT: v_mad_u32_u24 v0, v8, v15, v0 ; GFX8-NEXT: v_mad_u32_u24 v0, v7, v14, v0 ; GFX8-NEXT: v_mad_u32_u24 v0, v6, v13, v0 ; GFX8-NEXT: v_mad_u32_u24 v0, v5, v12, v0 ; GFX8-NEXT: v_mad_u32_u24 v0, v4, v11, v0 -; GFX8-NEXT: v_mad_u32_u24 v0, v3, v10, v0 -; GFX8-NEXT: v_mad_u32_u24 v2, v2, v9, v0 +; GFX8-NEXT: v_mad_u32_u24 v0, v2, v10, v0 +; GFX8-NEXT: v_mad_u32_u24 v2, v1, v9, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -2027,15 +2027,15 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] +; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot8_u32_u4 v0, v2, v3, s0 -; GFX9-DL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc32_vecMul: @@ -2118,48 +2118,48 @@ ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: buffer_load_ushort v16, off, s[0:3], 0 +; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 ; GFX7-NEXT: s_mov_b32 s4, 0xf0000 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_bfe_u32 v7, v2, 20, 4 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 12, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 28, v2 -; GFX7-NEXT: v_bfe_u32 v3, v2, 24, 4 -; GFX7-NEXT: v_bfe_u32 v4, v2, 12, 4 -; GFX7-NEXT: v_bfe_u32 v5, v2, 8, 4 -; GFX7-NEXT: v_and_b32_e32 v6, 15, v2 -; GFX7-NEXT: v_alignbit_b32 v2, v7, v2, 16 -; GFX7-NEXT: v_and_b32_e32 v7, s4, v8 +; GFX7-NEXT: v_bfe_u32 v8, v2, 20, 4 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 12, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2 +; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4 +; GFX7-NEXT: v_bfe_u32 v5, v2, 12, 4 +; GFX7-NEXT: v_bfe_u32 v6, v2, 8, 4 +; GFX7-NEXT: v_and_b32_e32 v7, 15, v2 +; GFX7-NEXT: v_alignbit_b32 v2, v8, v2, 16 +; GFX7-NEXT: v_and_b32_e32 v8, s4, v9 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 12, v0 -; GFX7-NEXT: v_and_b32_e32 v13, 15, v0 -; GFX7-NEXT: v_or_b32_e32 v6, v6, v7 -; GFX7-NEXT: v_and_b32_e32 v7, s4, v8 -; GFX7-NEXT: v_or_b32_e32 v7, v13, v7 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; GFX7-NEXT: v_and_b32_e32 v6, 15, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 12, v0 +; GFX7-NEXT: v_and_b32_e32 v14, 15, v0 +; GFX7-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX7-NEXT: v_and_b32_e32 v8, s4, v9 +; GFX7-NEXT: v_or_b32_e32 v8, v14, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v7 ; GFX7-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v8 +; GFX7-NEXT: v_and_b32_e32 v8, 15, v8 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v6, v6, v7, v16 -; GFX7-NEXT: v_bfe_u32 v12, v0, 8, 4 -; GFX7-NEXT: v_bfe_u32 v14, v0, 20, 4 -; GFX7-NEXT: v_mad_u32_u24 v6, v8, v13, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v9, 28, v0 -; GFX7-NEXT: v_bfe_u32 v10, v0, 24, 4 -; GFX7-NEXT: v_bfe_u32 v11, v0, 12, 4 -; GFX7-NEXT: v_alignbit_b32 v0, v14, v0, 16 -; GFX7-NEXT: v_mad_u32_u24 v5, v5, v12, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v2 +; GFX7-NEXT: v_mad_u32_u24 v1, v7, v8, v1 +; GFX7-NEXT: v_bfe_u32 v13, v0, 8, 4 +; GFX7-NEXT: v_bfe_u32 v15, v0, 20, 4 +; GFX7-NEXT: v_mad_u32_u24 v1, v9, v14, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0 +; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4 +; GFX7-NEXT: v_bfe_u32 v12, v0, 12, 4 +; GFX7-NEXT: v_alignbit_b32 v0, v15, v0, 16 +; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX7-NEXT: v_mad_u32_u24 v4, v4, v11, v5 -; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v4 -; GFX7-NEXT: v_mad_u32_u24 v0, v15, v14, v0 +; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1 +; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 +; GFX7-NEXT: v_mad_u32_u24 v0, v16, v15, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v1, v9, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -2174,46 +2174,46 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: flat_load_ushort v18, v[2:3] +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ushort v4, v[0:1] ; GFX8-NEXT: s_mov_b32 s10, -1 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000 ; GFX8-NEXT: s_add_u32 s8, s8, s3 ; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_and_b32_e32 v1, 15, v4 -; GFX8-NEXT: v_bfe_u32 v5, v4, 4, 4 -; GFX8-NEXT: v_bfe_u32 v6, v4, 8, 4 -; GFX8-NEXT: v_bfe_u32 v7, v4, 12, 4 -; GFX8-NEXT: v_bfe_u32 v8, v4, 16, 4 -; GFX8-NEXT: v_bfe_u32 v9, v4, 20, 4 -; GFX8-NEXT: v_bfe_u32 v10, v4, 24, 4 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 28, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 15, v3 +; GFX8-NEXT: v_bfe_u32 v6, v3, 4, 4 +; GFX8-NEXT: v_bfe_u32 v7, v3, 8, 4 +; GFX8-NEXT: v_bfe_u32 v8, v3, 12, 4 +; GFX8-NEXT: v_bfe_u32 v9, v3, 16, 4 +; GFX8-NEXT: v_bfe_u32 v10, v3, 20, 4 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v11, 15, v0 -; GFX8-NEXT: v_bfe_u32 v12, v0, 4, 4 +; GFX8-NEXT: v_and_b32_e32 v12, 15, v2 +; GFX8-NEXT: v_bfe_u32 v13, v2, 4, 4 +; GFX8-NEXT: v_bfe_u32 v14, v2, 8, 4 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u16 v1, v1, v11, v18 -; GFX8-NEXT: v_bfe_u32 v13, v0, 8, 4 -; GFX8-NEXT: v_mad_u16 v1, v5, v12, v1 -; GFX8-NEXT: v_bfe_u32 v14, v0, 12, 4 -; GFX8-NEXT: v_mad_u16 v1, v6, v13, v1 -; GFX8-NEXT: v_bfe_u32 v15, v0, 16, 4 -; GFX8-NEXT: v_mad_u16 v1, v7, v14, v1 -; GFX8-NEXT: v_bfe_u32 v16, v0, 20, 4 -; GFX8-NEXT: v_mad_u16 v1, v8, v15, v1 -; GFX8-NEXT: v_bfe_u32 v17, v0, 24, 4 -; GFX8-NEXT: v_mad_u16 v1, v9, v16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 28, v0 -; GFX8-NEXT: v_mad_u16 v1, v10, v17, v1 -; GFX8-NEXT: v_mad_u16 v0, v4, v0, v1 -; GFX8-NEXT: flat_store_short v[2:3], v0 +; GFX8-NEXT: v_mad_u16 v4, v5, v12, v4 +; GFX8-NEXT: v_mad_u16 v4, v6, v13, v4 +; GFX8-NEXT: v_bfe_u32 v15, v2, 12, 4 +; GFX8-NEXT: v_mad_u16 v4, v7, v14, v4 +; GFX8-NEXT: v_bfe_u32 v16, v2, 16, 4 +; GFX8-NEXT: v_mad_u16 v4, v8, v15, v4 +; GFX8-NEXT: v_bfe_u32 v17, v2, 20, 4 +; GFX8-NEXT: v_mad_u16 v4, v9, v16, v4 +; GFX8-NEXT: v_bfe_u32 v11, v3, 24, 4 +; GFX8-NEXT: v_bfe_u32 v18, v2, 24, 4 +; GFX8-NEXT: v_mad_u16 v4, v10, v17, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 28, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 28, v2 +; GFX8-NEXT: v_mad_u16 v4, v11, v18, v4 +; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 +; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: udot8_acc16_vecMul: @@ -2226,61 +2226,61 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[4:5] -; GFX9-NEXT: global_load_dword v4, v0, s[6:7] -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_bfe_u32 v5, v1, 24, 4 +; GFX9-NEXT: v_bfe_u32 v7, v1, 16, 4 +; GFX9-NEXT: v_bfe_u32 v9, v1, 8, 4 +; GFX9-NEXT: v_and_b32_e32 v11, 15, v1 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_bfe_u32 v0, v3, 24, 4 -; GFX9-NEXT: v_bfe_u32 v6, v3, 16, 4 -; GFX9-NEXT: v_bfe_u32 v8, v3, 8, 4 -; GFX9-NEXT: v_and_b32_e32 v10, 15, v3 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfe_u32 v11, v4, 24, 4 -; GFX9-NEXT: v_bfe_u32 v13, v4, 16, 4 -; GFX9-NEXT: v_bfe_u32 v15, v4, 8, 4 -; GFX9-NEXT: v_and_b32_e32 v17, 15, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 28, v3 -; GFX9-NEXT: v_bfe_u32 v7, v3, 20, 4 -; GFX9-NEXT: v_bfe_u32 v9, v3, 12, 4 -; GFX9-NEXT: v_bfe_u32 v3, v3, 4, 4 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 28, v4 -; GFX9-NEXT: v_bfe_u32 v14, v4, 20, 4 -; GFX9-NEXT: v_bfe_u32 v16, v4, 12, 4 -; GFX9-NEXT: v_bfe_u32 v4, v4, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v11, v2, v11 -; GFX9-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX9-NEXT: v_and_b32_e32 v13, v2, v13 -; GFX9-NEXT: v_and_b32_e32 v6, v2, v6 -; GFX9-NEXT: v_and_b32_e32 v15, v2, v15 -; GFX9-NEXT: v_and_b32_e32 v8, v2, v8 -; GFX9-NEXT: v_and_b32_e32 v17, v2, v17 -; GFX9-NEXT: v_and_b32_e32 v2, v2, v10 -; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v17 -; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX9-NEXT: v_pk_mul_lo_u16 v2, v2, v4 -; GFX9-NEXT: global_load_ushort v4, v1, s[2:3] -; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v5, v14, 16, v13 -; GFX9-NEXT: v_lshl_or_b32 v6, v7, 16, v6 -; GFX9-NEXT: v_lshl_or_b32 v7, v16, 16, v15 -; GFX9-NEXT: v_lshl_or_b32 v8, v9, 16, v8 -; GFX9-NEXT: v_pk_mul_lo_u16 v3, v6, v5 -; GFX9-NEXT: v_pk_mul_lo_u16 v5, v8, v7 -; GFX9-NEXT: v_lshl_or_b32 v10, v12, 16, v11 -; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v10 +; GFX9-NEXT: v_bfe_u32 v12, v2, 24, 4 +; GFX9-NEXT: v_bfe_u32 v14, v2, 16, 4 +; GFX9-NEXT: v_bfe_u32 v16, v2, 8, 4 +; GFX9-NEXT: v_and_b32_e32 v18, 15, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 28, v1 +; GFX9-NEXT: v_bfe_u32 v8, v1, 20, 4 +; GFX9-NEXT: v_bfe_u32 v10, v1, 12, 4 +; GFX9-NEXT: v_bfe_u32 v1, v1, 4, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 28, v2 +; GFX9-NEXT: v_bfe_u32 v15, v2, 20, 4 +; GFX9-NEXT: v_bfe_u32 v17, v2, 12, 4 +; GFX9-NEXT: v_bfe_u32 v2, v2, 4, 4 +; GFX9-NEXT: v_and_b32_e32 v12, v4, v12 +; GFX9-NEXT: v_and_b32_e32 v5, v4, v5 +; GFX9-NEXT: v_and_b32_e32 v14, v4, v14 +; GFX9-NEXT: v_and_b32_e32 v7, v4, v7 +; GFX9-NEXT: v_and_b32_e32 v16, v4, v16 +; GFX9-NEXT: v_and_b32_e32 v9, v4, v9 +; GFX9-NEXT: v_and_b32_e32 v18, v4, v18 +; GFX9-NEXT: v_and_b32_e32 v4, v4, v11 +; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v4 +; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 +; GFX9-NEXT: v_lshl_or_b32 v11, v13, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v5, v6, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v6, v15, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v7, v8, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v8, v17, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v9, v10, 16, v9 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v4, v2, v4 -; GFX9-NEXT: v_add_u16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u16_e32 v2, v2, v5 -; GFX9-NEXT: v_add_u16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u16_e32 v2, v2, v3 -; GFX9-NEXT: v_add_u16_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u16_e32 v2, v2, v0 -; GFX9-NEXT: v_add_u16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: global_store_short v1, v0, s[2:3] +; GFX9-NEXT: v_add_u16_e32 v2, v1, v3 +; GFX9-NEXT: v_pk_mul_lo_u16 v4, v5, v11 +; GFX9-NEXT: v_pk_mul_lo_u16 v5, v7, v6 +; GFX9-NEXT: v_pk_mul_lo_u16 v6, v9, v8 +; GFX9-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u16_e32 v1, v1, v6 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u16_e32 v1, v1, v5 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u16_e32 v1, v1, v4 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc16_vecMul: @@ -2293,61 +2293,61 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-DL-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v3, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v4, v0, s[6:7] -; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DL-NEXT: s_waitcnt vmcnt(2) +; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 24, 4 +; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 16, 4 +; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 8, 4 +; GFX9-DL-NEXT: v_and_b32_e32 v11, 15, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_bfe_u32 v0, v3, 24, 4 -; GFX9-DL-NEXT: v_bfe_u32 v6, v3, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v8, v3, 8, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v10, 15, v3 -; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_bfe_u32 v11, v4, 24, 4 -; GFX9-DL-NEXT: v_bfe_u32 v13, v4, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v15, v4, 8, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v17, 15, v4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v3 -; GFX9-DL-NEXT: v_bfe_u32 v7, v3, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v9, v3, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v3, v3, 4, 4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 28, v4 -; GFX9-DL-NEXT: v_bfe_u32 v14, v4, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v16, v4, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v4, v4, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v11, v2, v11 -; GFX9-DL-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX9-DL-NEXT: v_and_b32_e32 v13, v2, v13 -; GFX9-DL-NEXT: v_and_b32_e32 v6, v2, v6 -; GFX9-DL-NEXT: v_and_b32_e32 v15, v2, v15 -; GFX9-DL-NEXT: v_and_b32_e32 v8, v2, v8 -; GFX9-DL-NEXT: v_and_b32_e32 v17, v2, v17 -; GFX9-DL-NEXT: v_and_b32_e32 v2, v2, v10 -; GFX9-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v17 -; GFX9-DL-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v4 -; GFX9-DL-NEXT: global_load_ushort v4, v1, s[2:3] -; GFX9-DL-NEXT: v_lshl_or_b32 v0, v5, 16, v0 -; GFX9-DL-NEXT: v_lshl_or_b32 v5, v14, 16, v13 -; GFX9-DL-NEXT: v_lshl_or_b32 v6, v7, 16, v6 -; GFX9-DL-NEXT: v_lshl_or_b32 v7, v16, 16, v15 -; GFX9-DL-NEXT: v_lshl_or_b32 v8, v9, 16, v8 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, v6, v5 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v8, v7 -; GFX9-DL-NEXT: v_lshl_or_b32 v10, v12, 16, v11 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v0, v0, v10 +; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 24, 4 +; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 16, 4 +; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 8, 4 +; GFX9-DL-NEXT: v_and_b32_e32 v18, 15, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 28, v1 +; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 20, 4 +; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 12, 4 +; GFX9-DL-NEXT: v_bfe_u32 v1, v1, 4, 4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 28, v2 +; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 20, 4 +; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 12, 4 +; GFX9-DL-NEXT: v_bfe_u32 v2, v2, 4, 4 +; GFX9-DL-NEXT: v_and_b32_e32 v12, v4, v12 +; GFX9-DL-NEXT: v_and_b32_e32 v5, v4, v5 +; GFX9-DL-NEXT: v_and_b32_e32 v14, v4, v14 +; GFX9-DL-NEXT: v_and_b32_e32 v7, v4, v7 +; GFX9-DL-NEXT: v_and_b32_e32 v16, v4, v16 +; GFX9-DL-NEXT: v_and_b32_e32 v9, v4, v9 +; GFX9-DL-NEXT: v_and_b32_e32 v18, v4, v18 +; GFX9-DL-NEXT: v_and_b32_e32 v4, v4, v11 +; GFX9-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v18 +; GFX9-DL-NEXT: v_lshl_or_b32 v1, v1, 16, v4 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 +; GFX9-DL-NEXT: v_lshl_or_b32 v11, v13, 16, v12 +; GFX9-DL-NEXT: v_lshl_or_b32 v5, v6, 16, v5 +; GFX9-DL-NEXT: v_lshl_or_b32 v6, v15, 16, v14 +; GFX9-DL-NEXT: v_lshl_or_b32 v7, v8, 16, v7 +; GFX9-DL-NEXT: v_lshl_or_b32 v8, v17, 16, v16 +; GFX9-DL-NEXT: v_lshl_or_b32 v9, v10, 16, v9 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u16_e32 v4, v2, v4 -; GFX9-DL-NEXT: v_add_u16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v5 -; GFX9-DL-NEXT: v_add_u16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v3 -; GFX9-DL-NEXT: v_add_u16_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v0 -; GFX9-DL-NEXT: v_add_u16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: global_store_short v1, v0, s[2:3] +; GFX9-DL-NEXT: v_add_u16_e32 v2, v1, v3 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v11 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v7, v6 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v6, v9, v8 +; GFX9-DL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v6 +; GFX9-DL-NEXT: v_add_u16_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v5 +; GFX9-DL-NEXT: v_add_u16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v4 +; GFX9-DL-NEXT: v_add_u16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc16_vecMul: @@ -2479,75 +2479,75 @@ ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: buffer_load_ubyte v16, off, s[0:3], 0 +; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 ; GFX7-NEXT: s_movk_i32 s4, 0xf00 ; GFX7-NEXT: s_movk_i32 s5, 0xf0f ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 4, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 12, v2 -; GFX7-NEXT: v_bfe_u32 v1, v2, 8, 4 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 28, v2 -; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 12, v2 +; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 4 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 28, v2 +; GFX7-NEXT: v_bfe_u32 v7, v2, 16, 4 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 4, v2 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_lshrrev_b32_e32 v10, 4, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v12, 28, v0 -; GFX7-NEXT: v_and_b32_e32 v7, s4, v7 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 -; GFX7-NEXT: v_and_b32_e32 v4, 15, v2 -; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 4 -; GFX7-NEXT: v_and_b32_e32 v11, 15, v0 -; GFX7-NEXT: v_bfe_u32 v13, v0, 16, 4 -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 12, v0 -; GFX7-NEXT: v_alignbit_b32 v2, v5, v2, 24 -; GFX7-NEXT: v_and_b32_e32 v5, s4, v8 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 4, v0 -; GFX7-NEXT: v_or_b32_e32 v6, v6, v7 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX7-NEXT: v_alignbit_b32 v0, v12, v0, 24 -; GFX7-NEXT: v_and_b32_e32 v7, s4, v10 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v14 -; GFX7-NEXT: v_and_b32_e32 v5, s4, v8 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 4, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v13, 28, v0 +; GFX7-NEXT: v_and_b32_e32 v8, s4, v8 +; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 15, v2 +; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 4 +; GFX7-NEXT: v_and_b32_e32 v12, 15, v0 +; GFX7-NEXT: v_bfe_u32 v14, v0, 16, 4 +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 12, v0 +; GFX7-NEXT: v_alignbit_b32 v2, v6, v2, 24 +; GFX7-NEXT: v_and_b32_e32 v6, s4, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 4, v0 +; GFX7-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX7-NEXT: v_alignbit_b32 v0, v13, v0, 24 +; GFX7-NEXT: v_and_b32_e32 v8, s4, v11 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX7-NEXT: v_and_b32_e32 v4, s4, v15 +; GFX7-NEXT: v_and_b32_e32 v6, s4, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v0, s5, v0 -; GFX7-NEXT: v_or_b32_e32 v7, v9, v7 +; GFX7-NEXT: v_or_b32_e32 v8, v10, v8 ; GFX7-NEXT: v_and_b32_e32 v2, s5, v2 -; GFX7-NEXT: v_or_b32_e32 v3, v13, v3 -; GFX7-NEXT: v_or_b32_e32 v5, v11, v5 -; GFX7-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX7-NEXT: v_or_b32_e32 v4, v14, v4 +; GFX7-NEXT: v_or_b32_e32 v6, v12, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v4 -; GFX7-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX7-NEXT: v_and_b32_e32 v6, 15, v1 -; GFX7-NEXT: v_and_b32_e32 v12, 15, v3 -; GFX7-NEXT: v_bfe_u32 v7, v1, 8, 4 -; GFX7-NEXT: v_bfe_u32 v13, v3, 8, 4 +; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v5 +; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX7-NEXT: v_and_b32_e32 v7, 15, v3 +; GFX7-NEXT: v_and_b32_e32 v13, 15, v4 +; GFX7-NEXT: v_bfe_u32 v8, v3, 8, 4 +; GFX7-NEXT: v_bfe_u32 v14, v4, 8, 4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v6, v6, v12, v16 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v1 -; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 4 -; GFX7-NEXT: v_lshrrev_b32_e32 v10, 24, v3 +; GFX7-NEXT: v_mad_u32_u24 v1, v7, v13, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v3 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 4 -; GFX7-NEXT: v_mad_u32_u24 v6, v7, v13, v6 -; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, v6 -; GFX7-NEXT: v_and_b32_e32 v8, 15, v2 -; GFX7-NEXT: v_and_b32_e32 v14, 15, v0 -; GFX7-NEXT: v_mad_u32_u24 v1, v4, v10, v1 -; GFX7-NEXT: v_bfe_u32 v9, v2, 8, 4 -; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4 +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 24, v4 +; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 4 ; GFX7-NEXT: v_mad_u32_u24 v1, v8, v14, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v2 +; GFX7-NEXT: v_mad_u32_u24 v1, v3, v4, v1 +; GFX7-NEXT: v_and_b32_e32 v9, 15, v2 +; GFX7-NEXT: v_and_b32_e32 v15, 15, v0 +; GFX7-NEXT: v_mad_u32_u24 v1, v5, v11, v1 +; GFX7-NEXT: v_bfe_u32 v10, v2, 8, 4 +; GFX7-NEXT: v_bfe_u32 v16, v0, 8, 4 +; GFX7-NEXT: v_mad_u32_u24 v1, v9, v15, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v2 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 4 -; GFX7-NEXT: v_lshrrev_b32_e32 v11, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v12, 24, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 4 -; GFX7-NEXT: v_mad_u32_u24 v1, v9, v15, v1 +; GFX7-NEXT: v_mad_u32_u24 v1, v10, v16, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 -; GFX7-NEXT: v_mad_u32_u24 v0, v5, v11, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v6, v12, v0 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -2562,65 +2562,65 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s6, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: flat_load_dword v2, v[2:3] +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_ubyte v5, v[0:1] +; GFX8-NEXT: flat_load_ubyte v4, v[0:1] ; GFX8-NEXT: s_mov_b32 s10, -1 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000 ; GFX8-NEXT: s_add_u32 s8, s8, s3 ; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_bfe_u32 v3, v4, 20, 4 -; GFX8-NEXT: v_bfe_u32 v7, v4, 24, 4 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 28, v4 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 4 +; GFX8-NEXT: v_bfe_u32 v6, v3, 20, 4 +; GFX8-NEXT: v_bfe_u32 v7, v3, 24, 4 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 28, v3 +; GFX8-NEXT: v_bfe_u32 v9, v3, 8, 4 +; GFX8-NEXT: v_bfe_u32 v10, v3, 12, 4 ; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_bfe_u32 v12, v2, 16, 4 ; GFX8-NEXT: v_bfe_u32 v13, v2, 20, 4 ; GFX8-NEXT: v_bfe_u32 v14, v2, 24, 4 ; GFX8-NEXT: v_lshrrev_b32_e32 v15, 28, v2 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 4 -; GFX8-NEXT: v_bfe_u32 v9, v4, 8, 4 -; GFX8-NEXT: v_bfe_u32 v10, v4, 12, 4 -; GFX8-NEXT: v_and_b32_e32 v11, 15, v4 -; GFX8-NEXT: v_bfe_u32 v4, v4, 4, 4 -; GFX8-NEXT: v_bfe_u32 v12, v2, 16, 4 +; GFX8-NEXT: v_and_b32_e32 v11, 15, v3 +; GFX8-NEXT: v_bfe_u32 v3, v3, 4, 4 ; GFX8-NEXT: v_bfe_u32 v16, v2, 8, 4 ; GFX8-NEXT: v_bfe_u32 v17, v2, 12, 4 ; GFX8-NEXT: v_and_b32_e32 v18, 15, v2 ; GFX8-NEXT: v_bfe_u32 v2, v2, 4, 4 -; GFX8-NEXT: v_mul_lo_u16_sdwa v3, v3, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mul_lo_u16_e32 v19, v5, v12 +; GFX8-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_mul_lo_u16_e32 v13, v7, v14 ; GFX8-NEXT: v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_mul_lo_u16_e32 v19, v6, v12 ; GFX8-NEXT: v_mul_lo_u16_e32 v9, v9, v16 ; GFX8-NEXT: v_mul_lo_u16_sdwa v10, v10, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mul_lo_u16_sdwa v15, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v3, v19, v6 +; GFX8-NEXT: v_or_b32_e32 v6, v13, v8 +; GFX8-NEXT: v_or_b32_e32 v8, v9, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v6 ; GFX8-NEXT: v_mul_lo_u16_e32 v11, v11, v18 -; GFX8-NEXT: v_mul_lo_u16_sdwa v4, v4, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v8, v13, v8 -; GFX8-NEXT: v_or_b32_e32 v3, v19, v3 -; GFX8-NEXT: v_or_b32_e32 v9, v9, v10 -; GFX8-NEXT: v_or_b32_e32 v10, v11, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v8 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v4, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v9, v11, v15 +; GFX8-NEXT: v_or_b32_e32 v10, v15, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 8, v3 ; GFX8-NEXT: v_lshrrev_b64 v[2:3], 24, v[2:3] -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v10 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u16_e32 v3, v10, v5 -; GFX8-NEXT: v_add_u16_e32 v3, v3, v4 -; GFX8-NEXT: v_add_u16_e32 v3, v3, v9 +; GFX8-NEXT: v_add_u16_e32 v3, v9, v4 +; GFX8-NEXT: v_add_u16_e32 v3, v3, v10 +; GFX8-NEXT: v_add_u16_e32 v3, v3, v8 ; GFX8-NEXT: v_add_u16_e32 v2, v3, v2 -; GFX8-NEXT: v_mad_u16 v2, v6, v12, v2 +; GFX8-NEXT: v_mad_u16 v2, v5, v12, v2 ; GFX8-NEXT: v_add_u16_e32 v2, v2, v11 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v6 ; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2 -; GFX8-NEXT: v_add_u16_e32 v2, v2, v8 +; GFX8-NEXT: v_add_u16_e32 v2, v2, v6 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -2882,37 +2882,37 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: buffer_load_ubyte v16, off, s[0:3], 0 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 28, v2 -; GFX7-NEXT: v_bfe_u32 v3, v2, 24, 4 -; GFX7-NEXT: v_bfe_u32 v4, v2, 20, 4 -; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 4 -; GFX7-NEXT: v_bfe_u32 v6, v2, 12, 4 -; GFX7-NEXT: v_bfe_u32 v7, v2, 8, 4 -; GFX7-NEXT: v_bfe_u32 v8, v2, 4, 4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2 +; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4 +; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4 +; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4 +; GFX7-NEXT: v_bfe_u32 v7, v2, 12, 4 +; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4 +; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4 ; GFX7-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_lshrrev_b32_e32 v9, 28, v0 -; GFX7-NEXT: v_bfe_u32 v10, v0, 24, 4 -; GFX7-NEXT: v_bfe_u32 v11, v0, 20, 4 -; GFX7-NEXT: v_bfe_u32 v12, v0, 16, 4 -; GFX7-NEXT: v_bfe_u32 v13, v0, 12, 4 -; GFX7-NEXT: v_bfe_u32 v14, v0, 8, 4 -; GFX7-NEXT: v_bfe_u32 v15, v0, 4, 4 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0 +; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4 +; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4 +; GFX7-NEXT: v_bfe_u32 v13, v0, 16, 4 +; GFX7-NEXT: v_bfe_u32 v14, v0, 12, 4 +; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4 +; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v16 +; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 +; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v1, v9, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -2928,55 +2928,55 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ubyte v4, v[0:1] ; GFX8-NEXT: s_mov_b32 s10, -1 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000 ; GFX8-NEXT: s_add_u32 s8, s8, s3 ; GFX8-NEXT: s_addc_u32 s9, s9, 0 +; GFX8-NEXT: s_waitcnt vmcnt(2) +; GFX8-NEXT: v_and_b32_e32 v5, 15, v3 +; GFX8-NEXT: v_bfe_u32 v6, v3, 4, 4 +; GFX8-NEXT: v_bfe_u32 v7, v3, 8, 4 +; GFX8-NEXT: v_bfe_u32 v8, v3, 12, 4 +; GFX8-NEXT: v_bfe_u32 v9, v3, 16, 4 +; GFX8-NEXT: v_bfe_u32 v10, v3, 20, 4 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v1, 15, v4 -; GFX8-NEXT: v_bfe_u32 v5, v4, 4, 4 -; GFX8-NEXT: v_bfe_u32 v6, v4, 8, 4 -; GFX8-NEXT: v_bfe_u32 v7, v4, 12, 4 -; GFX8-NEXT: v_bfe_u32 v8, v4, 16, 4 -; GFX8-NEXT: v_bfe_u32 v9, v4, 20, 4 -; GFX8-NEXT: v_bfe_u32 v10, v4, 24, 4 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 28, v4 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v11, 15, v0 -; GFX8-NEXT: v_bfe_u32 v12, v0, 4, 4 -; GFX8-NEXT: v_bfe_u32 v13, v0, 8, 4 -; GFX8-NEXT: v_bfe_u32 v14, v0, 12, 4 -; GFX8-NEXT: v_bfe_u32 v15, v0, 16, 4 -; GFX8-NEXT: v_bfe_u32 v16, v0, 20, 4 -; GFX8-NEXT: v_bfe_u32 v17, v0, 24, 4 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 28, v0 -; GFX8-NEXT: v_mul_u32_u24_e32 v0, v4, v0 -; GFX8-NEXT: v_mul_u32_u24_e32 v4, v10, v17 -; GFX8-NEXT: flat_load_ubyte v10, v[2:3] -; GFX8-NEXT: v_mul_u32_u24_e32 v1, v1, v11 +; GFX8-NEXT: v_and_b32_e32 v12, 15, v2 +; GFX8-NEXT: v_bfe_u32 v13, v2, 4, 4 ; GFX8-NEXT: v_mul_u32_u24_e32 v5, v5, v12 +; GFX8-NEXT: v_bfe_u32 v14, v2, 8, 4 ; GFX8-NEXT: v_mul_u32_u24_e32 v6, v6, v13 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u16_e32 v4, v5, v4 +; GFX8-NEXT: v_bfe_u32 v15, v2, 12, 4 ; GFX8-NEXT: v_mul_u32_u24_e32 v7, v7, v14 +; GFX8-NEXT: v_add_u16_e32 v4, v4, v6 +; GFX8-NEXT: v_bfe_u32 v16, v2, 16, 4 ; GFX8-NEXT: v_mul_u32_u24_e32 v8, v8, v15 +; GFX8-NEXT: v_add_u16_e32 v4, v4, v7 +; GFX8-NEXT: v_bfe_u32 v17, v2, 20, 4 ; GFX8-NEXT: v_mul_u32_u24_e32 v9, v9, v16 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u16_e32 v1, v1, v10 -; GFX8-NEXT: v_add_u16_e32 v1, v1, v5 -; GFX8-NEXT: v_add_u16_e32 v1, v1, v6 -; GFX8-NEXT: v_add_u16_e32 v1, v1, v7 -; GFX8-NEXT: v_add_u16_e32 v1, v1, v8 -; GFX8-NEXT: v_add_u16_e32 v1, v1, v9 -; GFX8-NEXT: v_add_u16_e32 v1, v1, v4 -; GFX8-NEXT: v_add_u16_e32 v0, v1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX8-NEXT: flat_store_byte v[2:3], v0 +; GFX8-NEXT: v_add_u16_e32 v4, v4, v8 +; GFX8-NEXT: v_bfe_u32 v11, v3, 24, 4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 28, v3 +; GFX8-NEXT: v_bfe_u32 v18, v2, 24, 4 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 28, v2 +; GFX8-NEXT: v_mul_u32_u24_e32 v10, v10, v17 +; GFX8-NEXT: v_add_u16_e32 v4, v4, v9 +; GFX8-NEXT: v_mul_u32_u24_e32 v2, v3, v2 +; GFX8-NEXT: v_mul_u32_u24_e32 v3, v11, v18 +; GFX8-NEXT: v_add_u16_e32 v4, v4, v10 +; GFX8-NEXT: v_add_u16_e32 v3, v4, v3 +; GFX8-NEXT: v_add_u16_e32 v2, v3, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: udot8_acc4_vecMul: @@ -2989,49 +2989,49 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] ; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_and_b32_e32 v4, 15, v1 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v0, 15, v2 -; GFX9-NEXT: v_bfe_u32 v4, v2, 4, 4 -; GFX9-NEXT: v_bfe_u32 v5, v2, 8, 4 -; GFX9-NEXT: v_bfe_u32 v6, v2, 12, 4 -; GFX9-NEXT: v_bfe_u32 v7, v2, 16, 4 -; GFX9-NEXT: v_bfe_u32 v8, v2, 20, 4 -; GFX9-NEXT: v_bfe_u32 v9, v2, 24, 4 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v10, 15, v3 -; GFX9-NEXT: v_bfe_u32 v11, v3, 4, 4 -; GFX9-NEXT: v_bfe_u32 v12, v3, 8, 4 -; GFX9-NEXT: v_bfe_u32 v13, v3, 12, 4 -; GFX9-NEXT: v_bfe_u32 v14, v3, 16, 4 -; GFX9-NEXT: v_bfe_u32 v15, v3, 20, 4 -; GFX9-NEXT: v_bfe_u32 v16, v3, 24, 4 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 28, v3 -; GFX9-NEXT: v_mul_u32_u24_e32 v2, v2, v3 -; GFX9-NEXT: v_mul_u32_u24_e32 v3, v9, v16 -; GFX9-NEXT: global_load_ubyte v9, v1, s[2:3] -; GFX9-NEXT: v_mul_u32_u24_e32 v0, v0, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 15, v2 +; GFX9-NEXT: v_bfe_u32 v5, v1, 4, 4 +; GFX9-NEXT: v_bfe_u32 v12, v2, 4, 4 ; GFX9-NEXT: v_mul_u32_u24_e32 v4, v4, v11 +; GFX9-NEXT: v_bfe_u32 v6, v1, 8, 4 +; GFX9-NEXT: v_bfe_u32 v13, v2, 8, 4 ; GFX9-NEXT: v_mul_u32_u24_e32 v5, v5, v12 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v3, v4, v3 +; GFX9-NEXT: v_bfe_u32 v7, v1, 12, 4 +; GFX9-NEXT: v_bfe_u32 v14, v2, 12, 4 ; GFX9-NEXT: v_mul_u32_u24_e32 v6, v6, v13 +; GFX9-NEXT: v_add_u16_e32 v3, v3, v5 +; GFX9-NEXT: v_bfe_u32 v8, v1, 16, 4 +; GFX9-NEXT: v_bfe_u32 v15, v2, 16, 4 ; GFX9-NEXT: v_mul_u32_u24_e32 v7, v7, v14 +; GFX9-NEXT: v_add_u16_e32 v3, v3, v6 +; GFX9-NEXT: v_bfe_u32 v9, v1, 20, 4 +; GFX9-NEXT: v_bfe_u32 v16, v2, 20, 4 ; GFX9-NEXT: v_mul_u32_u24_e32 v8, v8, v15 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v0, v0, v9 -; GFX9-NEXT: v_add_u16_e32 v0, v0, v4 -; GFX9-NEXT: v_add_u16_e32 v0, v0, v5 -; GFX9-NEXT: v_add_u16_e32 v0, v0, v6 -; GFX9-NEXT: v_add_u16_e32 v0, v0, v7 -; GFX9-NEXT: v_add_u16_e32 v0, v0, v8 -; GFX9-NEXT: v_add_u16_e32 v0, v0, v3 -; GFX9-NEXT: v_add_u16_e32 v0, v0, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX9-NEXT: global_store_byte v1, v0, s[2:3] +; GFX9-NEXT: v_add_u16_e32 v3, v3, v7 +; GFX9-NEXT: v_bfe_u32 v10, v1, 24, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 28, v1 +; GFX9-NEXT: v_bfe_u32 v17, v2, 24, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 28, v2 +; GFX9-NEXT: v_mul_u32_u24_e32 v9, v9, v16 +; GFX9-NEXT: v_add_u16_e32 v3, v3, v8 +; GFX9-NEXT: v_mul_u32_u24_e32 v1, v1, v2 +; GFX9-NEXT: v_mul_u32_u24_e32 v2, v10, v17 +; GFX9-NEXT: v_add_u16_e32 v3, v3, v9 +; GFX9-NEXT: v_add_u16_e32 v2, v3, v2 +; GFX9-NEXT: v_add_u16_e32 v1, v2, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX9-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc4_vecMul: @@ -3044,49 +3044,49 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3] +; GFX9-DL-NEXT: s_waitcnt vmcnt(2) +; GFX9-DL-NEXT: v_and_b32_e32 v4, 15, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_and_b32_e32 v0, 15, v2 -; GFX9-DL-NEXT: v_bfe_u32 v4, v2, 4, 4 -; GFX9-DL-NEXT: v_bfe_u32 v5, v2, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v6, v2, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v7, v2, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v8, v2, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v9, v2, 24, 4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_and_b32_e32 v10, 15, v3 -; GFX9-DL-NEXT: v_bfe_u32 v11, v3, 4, 4 -; GFX9-DL-NEXT: v_bfe_u32 v12, v3, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v13, v3, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v14, v3, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v15, v3, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v16, v3, 24, 4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v3 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v2, v2, v3 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v3, v9, v16 -; GFX9-DL-NEXT: global_load_ubyte v9, v1, s[2:3] -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v0, v0, v10 +; GFX9-DL-NEXT: v_and_b32_e32 v11, 15, v2 +; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 4, 4 +; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 4, 4 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v4, v4, v11 +; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 8, 4 +; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 8, 4 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v5, v5, v12 +; GFX9-DL-NEXT: s_waitcnt vmcnt(0) +; GFX9-DL-NEXT: v_add_u16_e32 v3, v4, v3 +; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 12, 4 +; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 12, 4 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v6, v6, v13 +; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v5 +; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 16, 4 +; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 16, 4 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v7, v7, v14 +; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v6 +; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 20, 4 +; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 20, 4 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v8, v8, v15 -; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v9 -; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v4 -; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v5 -; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v6 -; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v7 -; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v8 -; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v3 -; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v2 -; GFX9-DL-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX9-DL-NEXT: global_store_byte v1, v0, s[2:3] +; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v7 +; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 24, 4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1 +; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 24, 4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v9, v9, v16 +; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v8 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v1, v1, v2 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v2, v10, v17 +; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v9 +; GFX9-DL-NEXT: v_add_u16_e32 v2, v3, v2 +; GFX9-DL-NEXT: v_add_u16_e32 v1, v2, v1 +; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc4_vecMul: @@ -3231,7 +3231,6 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -3240,6 +3239,7 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v1, 15, v3 ; GFX8-NEXT: v_bfe_u32 v4, v3, 4, 4 @@ -3321,14 +3321,14 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] +; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot8_u32_u4 v0, v3, v2, s0 -; GFX9-DL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-DL-NEXT: v_dot8_u32_u4 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_variant1: diff --git a/llvm/test/CodeGen/AMDGPU/imm.ll b/llvm/test/CodeGen/AMDGPU/imm.ll --- a/llvm/test/CodeGen/AMDGPU/imm.ll +++ b/llvm/test/CodeGen/AMDGPU/imm.ll @@ -423,24 +423,24 @@ define amdgpu_kernel void @add_inline_imm_0.0_f32(float addrspace(1)* %out, float %x) { ; SI-LABEL: add_inline_imm_0.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f32_e64 v0, s0, 0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_add_f32_e64 v0, s4, 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_0.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s0, 0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: v_add_f32_e64 v0, s4, 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, 0.0 store float %y, float addrspace(1)* %out @@ -450,24 +450,24 @@ define amdgpu_kernel void @add_inline_imm_0.5_f32(float addrspace(1)* %out, float %x) { ; SI-LABEL: add_inline_imm_0.5_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f32_e64 v0, s0, 0.5 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_add_f32_e64 v0, s4, 0.5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_0.5_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s0, 0.5 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: v_add_f32_e64 v0, s4, 0.5 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, 0.5 store float %y, float addrspace(1)* %out @@ -477,24 +477,24 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_f32(float addrspace(1)* %out, float %x) { ; SI-LABEL: add_inline_imm_neg_0.5_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f32_e64 v0, s0, -0.5 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_add_f32_e64 v0, s4, -0.5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_neg_0.5_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s0, -0.5 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: v_add_f32_e64 v0, s4, -0.5 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, -0.5 store float %y, float addrspace(1)* %out @@ -504,24 +504,24 @@ define amdgpu_kernel void @add_inline_imm_1.0_f32(float addrspace(1)* %out, float %x) { ; SI-LABEL: add_inline_imm_1.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f32_e64 v0, s0, 1.0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_add_f32_e64 v0, s4, 1.0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_1.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s0, 1.0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: v_add_f32_e64 v0, s4, 1.0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, 1.0 store float %y, float addrspace(1)* %out @@ -531,24 +531,24 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_f32(float addrspace(1)* %out, float %x) { ; SI-LABEL: add_inline_imm_neg_1.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f32_e64 v0, s0, -1.0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_add_f32_e64 v0, s4, -1.0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_neg_1.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s0, -1.0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: v_add_f32_e64 v0, s4, -1.0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, -1.0 store float %y, float addrspace(1)* %out @@ -558,24 +558,24 @@ define amdgpu_kernel void @add_inline_imm_2.0_f32(float addrspace(1)* %out, float %x) { ; SI-LABEL: add_inline_imm_2.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f32_e64 v0, s0, 2.0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_add_f32_e64 v0, s4, 2.0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_2.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s0, 2.0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: v_add_f32_e64 v0, s4, 2.0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, 2.0 store float %y, float addrspace(1)* %out @@ -585,24 +585,24 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_f32(float addrspace(1)* %out, float %x) { ; SI-LABEL: add_inline_imm_neg_2.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f32_e64 v0, s0, -2.0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_add_f32_e64 v0, s4, -2.0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_neg_2.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s0, -2.0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: v_add_f32_e64 v0, s4, -2.0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, -2.0 store float %y, float addrspace(1)* %out @@ -612,24 +612,24 @@ define amdgpu_kernel void @add_inline_imm_4.0_f32(float addrspace(1)* %out, float %x) { ; SI-LABEL: add_inline_imm_4.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f32_e64 v0, s0, 4.0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_add_f32_e64 v0, s4, 4.0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_4.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s0, 4.0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: v_add_f32_e64 v0, s4, 4.0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, 4.0 store float %y, float addrspace(1)* %out @@ -639,24 +639,24 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f32(float addrspace(1)* %out, float %x) { ; SI-LABEL: add_inline_imm_neg_4.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f32_e64 v0, s0, -4.0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_add_f32_e64 v0, s4, -4.0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_neg_4.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s0, -4.0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: v_add_f32_e64 v0, s4, -4.0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, -4.0 store float %y, float addrspace(1)* %out @@ -666,38 +666,38 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_f32(float addrspace(1)* %out, float addrspace(1)* %in) { ; SI-LABEL: commute_add_inline_imm_0.5_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f32_e32 v0, 0.5, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: commute_add_inline_imm_0.5_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f32_e32 v0, 0.5, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %x = load float, float addrspace(1)* %in %y = fadd float %x, 0.5 @@ -708,38 +708,38 @@ define amdgpu_kernel void @commute_add_literal_f32(float addrspace(1)* %out, float addrspace(1)* %in) { ; SI-LABEL: commute_add_literal_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f32_e32 v0, 0x44800000, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: commute_add_literal_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f32_e32 v0, 0x44800000, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %x = load float, float addrspace(1)* %in %y = fadd float %x, 1024.0 @@ -750,24 +750,24 @@ define amdgpu_kernel void @add_inline_imm_1_f32(float addrspace(1)* %out, float %x) { ; SI-LABEL: add_inline_imm_1_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f32_e64 v0, s0, 1 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_add_f32_e64 v0, s4, 1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_1_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s0, 1 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: v_add_f32_e64 v0, s4, 1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, 0x36a0000000000000 store float %y, float addrspace(1)* %out @@ -777,24 +777,24 @@ define amdgpu_kernel void @add_inline_imm_2_f32(float addrspace(1)* %out, float %x) { ; SI-LABEL: add_inline_imm_2_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f32_e64 v0, s0, 2 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_add_f32_e64 v0, s4, 2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_2_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s0, 2 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: v_add_f32_e64 v0, s4, 2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, 0x36b0000000000000 store float %y, float addrspace(1)* %out @@ -804,24 +804,24 @@ define amdgpu_kernel void @add_inline_imm_16_f32(float addrspace(1)* %out, float %x) { ; SI-LABEL: add_inline_imm_16_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f32_e64 v0, s0, 16 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_add_f32_e64 v0, s4, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_16_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s0, 16 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: v_add_f32_e64 v0, s4, 16 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, 0x36e0000000000000 store float %y, float addrspace(1)* %out @@ -831,26 +831,26 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f32(float addrspace(1)* %out, float %x) { ; SI-LABEL: add_inline_imm_neg_1_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_add_i32 s0, s0, -1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_add_i32 s4, s4, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_neg_1_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_i32 s0, s0, -1 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_add_i32 s4, s4, -1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %xbc = bitcast float %x to i32 %y = add i32 %xbc, -1 @@ -862,26 +862,26 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f32(float addrspace(1)* %out, float %x) { ; SI-LABEL: add_inline_imm_neg_2_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_add_i32 s0, s0, -2 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_add_i32 s4, s4, -2 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_neg_2_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_i32 s0, s0, -2 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_add_i32 s4, s4, -2 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %xbc = bitcast float %x to i32 %y = add i32 %xbc, -2 @@ -893,26 +893,26 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f32(float addrspace(1)* %out, float %x) { ; SI-LABEL: add_inline_imm_neg_16_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_add_i32 s0, s0, -16 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_add_i32 s4, s4, -16 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_neg_16_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_i32 s0, s0, -16 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_add_i32 s4, s4, -16 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %xbc = bitcast float %x to i32 %y = add i32 %xbc, -16 @@ -924,24 +924,24 @@ define amdgpu_kernel void @add_inline_imm_63_f32(float addrspace(1)* %out, float %x) { ; SI-LABEL: add_inline_imm_63_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f32_e64 v0, s0, 63 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_add_f32_e64 v0, s4, 63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_63_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s0, 63 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: v_add_f32_e64 v0, s4, 63 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, 0x36ff800000000000 store float %y, float addrspace(1)* %out @@ -951,24 +951,24 @@ define amdgpu_kernel void @add_inline_imm_64_f32(float addrspace(1)* %out, float %x) { ; SI-LABEL: add_inline_imm_64_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f32_e64 v0, s0, 64 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_add_f32_e64 v0, s4, 64 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_64_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s0, 64 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: v_add_f32_e64 v0, s4, 64 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, 0x3700000000000000 store float %y, float addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/imm16.ll b/llvm/test/CodeGen/AMDGPU/imm16.ll --- a/llvm/test/CodeGen/AMDGPU/imm16.ll +++ b/llvm/test/CodeGen/AMDGPU/imm16.ll @@ -499,12 +499,12 @@ ; ; VI-LABEL: add_inline_imm_0.0_f16: ; VI: ; %bb.0: +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] -; VI-NEXT: s_load_dword s4, s[4:5], 0x8 ; encoding: [0x02,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, 0 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0x00,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, 0 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0x00,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; @@ -540,12 +540,12 @@ ; ; VI-LABEL: add_inline_imm_0.5_f16: ; VI: ; %bb.0: +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] -; VI-NEXT: s_load_dword s4, s[4:5], 0x8 ; encoding: [0x02,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, 0.5 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0xe0,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, 0.5 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0xe0,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; @@ -581,12 +581,12 @@ ; ; VI-LABEL: add_inline_imm_neg_0.5_f16: ; VI: ; %bb.0: +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] -; VI-NEXT: s_load_dword s4, s[4:5], 0x8 ; encoding: [0x02,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, -0.5 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0xe2,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, -0.5 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0xe2,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; @@ -622,12 +622,12 @@ ; ; VI-LABEL: add_inline_imm_1.0_f16: ; VI: ; %bb.0: +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] -; VI-NEXT: s_load_dword s4, s[4:5], 0x8 ; encoding: [0x02,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, 1.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0xe4,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, 1.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0xe4,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; @@ -663,12 +663,12 @@ ; ; VI-LABEL: add_inline_imm_neg_1.0_f16: ; VI: ; %bb.0: +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] -; VI-NEXT: s_load_dword s4, s[4:5], 0x8 ; encoding: [0x02,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, -1.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0xe6,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, -1.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0xe6,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; @@ -704,12 +704,12 @@ ; ; VI-LABEL: add_inline_imm_2.0_f16: ; VI: ; %bb.0: +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] -; VI-NEXT: s_load_dword s4, s[4:5], 0x8 ; encoding: [0x02,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, 2.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0xe8,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, 2.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0xe8,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; @@ -745,12 +745,12 @@ ; ; VI-LABEL: add_inline_imm_neg_2.0_f16: ; VI: ; %bb.0: +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] -; VI-NEXT: s_load_dword s4, s[4:5], 0x8 ; encoding: [0x02,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, -2.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0xea,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, -2.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0xea,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; @@ -786,12 +786,12 @@ ; ; VI-LABEL: add_inline_imm_4.0_f16: ; VI: ; %bb.0: +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] -; VI-NEXT: s_load_dword s4, s[4:5], 0x8 ; encoding: [0x02,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, 4.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0xec,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, 4.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0xec,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; @@ -827,12 +827,12 @@ ; ; VI-LABEL: add_inline_imm_neg_4.0_f16: ; VI: ; %bb.0: +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] -; VI-NEXT: s_load_dword s4, s[4:5], 0x8 ; encoding: [0x02,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, -4.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0xee,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, -4.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0xee,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; @@ -874,40 +874,40 @@ ; ; VI-LABEL: commute_add_inline_imm_0.5_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; encoding: [0x02,0x01,0x0a,0xc0,0x00,0x00,0x00,0x00] -; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] -; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0xf0,0x00,0x11] +; VI-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] +; VI-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] +; VI-NEXT: s_mov_b32 s11, s7 ; encoding: [0x07,0x00,0x8b,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: s_mov_b32 s0, s4 ; encoding: [0x04,0x00,0x80,0xbe] -; VI-NEXT: s_mov_b32 s1, s5 ; encoding: [0x05,0x00,0x81,0xbe] -; VI-NEXT: s_mov_b32 s4, s6 ; encoding: [0x06,0x00,0x84,0xbe] -; VI-NEXT: s_mov_b32 s5, s7 ; encoding: [0x07,0x00,0x85,0xbe] -; VI-NEXT: s_mov_b32 s6, s2 ; encoding: [0x02,0x00,0x86,0xbe] -; VI-NEXT: s_mov_b32 s7, s3 ; encoding: [0x03,0x00,0x87,0xbe] -; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x48,0xe0,0x00,0x00,0x01,0x80] +; VI-NEXT: s_mov_b32 s8, s2 ; encoding: [0x02,0x00,0x88,0xbe] +; VI-NEXT: s_mov_b32 s9, s3 ; encoding: [0x03,0x00,0x89,0xbe] +; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x48,0xe0,0x00,0x00,0x02,0x80] +; VI-NEXT: s_mov_b32 s4, s0 ; encoding: [0x00,0x00,0x84,0xbe] +; VI-NEXT: s_mov_b32 s5, s1 ; encoding: [0x01,0x00,0x85,0xbe] ; VI-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] ; VI-NEXT: v_add_f16_e32 v0, 0.5, v0 ; encoding: [0xf0,0x00,0x00,0x3e] -; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x01,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: commute_add_inline_imm_0.5_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v0, 0.5, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm %x = load half, half addrspace(1)* %in %y = fadd half %x, 0.5 @@ -936,40 +936,40 @@ ; ; VI-LABEL: commute_add_literal_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; encoding: [0x02,0x01,0x0a,0xc0,0x00,0x00,0x00,0x00] -; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] -; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0xf0,0x00,0x11] +; VI-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] +; VI-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] +; VI-NEXT: s_mov_b32 s11, s7 ; encoding: [0x07,0x00,0x8b,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: s_mov_b32 s0, s4 ; encoding: [0x04,0x00,0x80,0xbe] -; VI-NEXT: s_mov_b32 s1, s5 ; encoding: [0x05,0x00,0x81,0xbe] -; VI-NEXT: s_mov_b32 s4, s6 ; encoding: [0x06,0x00,0x84,0xbe] -; VI-NEXT: s_mov_b32 s5, s7 ; encoding: [0x07,0x00,0x85,0xbe] -; VI-NEXT: s_mov_b32 s6, s2 ; encoding: [0x02,0x00,0x86,0xbe] -; VI-NEXT: s_mov_b32 s7, s3 ; encoding: [0x03,0x00,0x87,0xbe] -; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x48,0xe0,0x00,0x00,0x01,0x80] +; VI-NEXT: s_mov_b32 s8, s2 ; encoding: [0x02,0x00,0x88,0xbe] +; VI-NEXT: s_mov_b32 s9, s3 ; encoding: [0x03,0x00,0x89,0xbe] +; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x48,0xe0,0x00,0x00,0x02,0x80] +; VI-NEXT: s_mov_b32 s4, s0 ; encoding: [0x00,0x00,0x84,0xbe] +; VI-NEXT: s_mov_b32 s5, s1 ; encoding: [0x01,0x00,0x85,0xbe] ; VI-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] ; VI-NEXT: v_add_f16_e32 v0, 0x6400, v0 ; encoding: [0xff,0x00,0x00,0x3e,0x00,0x64,0x00,0x00] -; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x01,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: commute_add_literal_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v0, 0x44800000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm %x = load half, half addrspace(1)* %in %y = fadd half %x, 1024.0 @@ -992,12 +992,12 @@ ; ; VI-LABEL: add_inline_imm_1_f16: ; VI: ; %bb.0: +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] -; VI-NEXT: s_load_dword s4, s[4:5], 0x8 ; encoding: [0x02,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, 1 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0x02,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, 1 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0x02,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; @@ -1033,12 +1033,12 @@ ; ; VI-LABEL: add_inline_imm_2_f16: ; VI: ; %bb.0: +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] -; VI-NEXT: s_load_dword s4, s[4:5], 0x8 ; encoding: [0x02,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, 2 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0x04,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, 2 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0x04,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; @@ -1074,12 +1074,12 @@ ; ; VI-LABEL: add_inline_imm_16_f16: ; VI: ; %bb.0: +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] -; VI-NEXT: s_load_dword s4, s[4:5], 0x8 ; encoding: [0x02,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, 16 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0x20,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, 16 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0x20,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; @@ -1121,38 +1121,38 @@ ; ; VI-LABEL: add_inline_imm_neg_1_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; encoding: [0x02,0x01,0x0a,0xc0,0x00,0x00,0x00,0x00] -; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] -; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0xf0,0x00,0x11] +; VI-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] +; VI-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] +; VI-NEXT: s_mov_b32 s11, s7 ; encoding: [0x07,0x00,0x8b,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: s_mov_b32 s0, s4 ; encoding: [0x04,0x00,0x80,0xbe] -; VI-NEXT: s_mov_b32 s1, s5 ; encoding: [0x05,0x00,0x81,0xbe] -; VI-NEXT: s_mov_b32 s4, s6 ; encoding: [0x06,0x00,0x84,0xbe] -; VI-NEXT: s_mov_b32 s5, s7 ; encoding: [0x07,0x00,0x85,0xbe] -; VI-NEXT: s_mov_b32 s6, s2 ; encoding: [0x02,0x00,0x86,0xbe] -; VI-NEXT: s_mov_b32 s7, s3 ; encoding: [0x03,0x00,0x87,0xbe] -; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x48,0xe0,0x00,0x00,0x01,0x80] +; VI-NEXT: s_mov_b32 s8, s2 ; encoding: [0x02,0x00,0x88,0xbe] +; VI-NEXT: s_mov_b32 s9, s3 ; encoding: [0x03,0x00,0x89,0xbe] +; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x48,0xe0,0x00,0x00,0x02,0x80] +; VI-NEXT: s_mov_b32 s4, s0 ; encoding: [0x00,0x00,0x84,0xbe] +; VI-NEXT: s_mov_b32 s5, s1 ; encoding: [0x01,0x00,0x85,0xbe] ; VI-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] ; VI-NEXT: v_add_u16_e32 v0, -1, v0 ; encoding: [0xc1,0x00,0x00,0x4c] -; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x01,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: add_inline_imm_neg_1_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v0 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm %x = load i16, i16 addrspace(1)* %in %y = add i16 %x, -1 @@ -1182,38 +1182,38 @@ ; ; VI-LABEL: add_inline_imm_neg_2_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; encoding: [0x02,0x01,0x0a,0xc0,0x00,0x00,0x00,0x00] -; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] -; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0xf0,0x00,0x11] +; VI-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] +; VI-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] +; VI-NEXT: s_mov_b32 s11, s7 ; encoding: [0x07,0x00,0x8b,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: s_mov_b32 s0, s4 ; encoding: [0x04,0x00,0x80,0xbe] -; VI-NEXT: s_mov_b32 s1, s5 ; encoding: [0x05,0x00,0x81,0xbe] -; VI-NEXT: s_mov_b32 s4, s6 ; encoding: [0x06,0x00,0x84,0xbe] -; VI-NEXT: s_mov_b32 s5, s7 ; encoding: [0x07,0x00,0x85,0xbe] -; VI-NEXT: s_mov_b32 s6, s2 ; encoding: [0x02,0x00,0x86,0xbe] -; VI-NEXT: s_mov_b32 s7, s3 ; encoding: [0x03,0x00,0x87,0xbe] -; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x48,0xe0,0x00,0x00,0x01,0x80] +; VI-NEXT: s_mov_b32 s8, s2 ; encoding: [0x02,0x00,0x88,0xbe] +; VI-NEXT: s_mov_b32 s9, s3 ; encoding: [0x03,0x00,0x89,0xbe] +; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x48,0xe0,0x00,0x00,0x02,0x80] +; VI-NEXT: s_mov_b32 s4, s0 ; encoding: [0x00,0x00,0x84,0xbe] +; VI-NEXT: s_mov_b32 s5, s1 ; encoding: [0x01,0x00,0x85,0xbe] ; VI-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] ; VI-NEXT: v_add_u16_e32 v0, -2, v0 ; encoding: [0xc2,0x00,0x00,0x4c] -; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x01,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: add_inline_imm_neg_2_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, -2, v0 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm %x = load i16, i16 addrspace(1)* %in %y = add i16 %x, -2 @@ -1243,38 +1243,38 @@ ; ; VI-LABEL: add_inline_imm_neg_16_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; encoding: [0x02,0x01,0x0a,0xc0,0x00,0x00,0x00,0x00] -; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] -; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0xf0,0x00,0x11] +; VI-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] +; VI-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] +; VI-NEXT: s_mov_b32 s11, s7 ; encoding: [0x07,0x00,0x8b,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: s_mov_b32 s0, s4 ; encoding: [0x04,0x00,0x80,0xbe] -; VI-NEXT: s_mov_b32 s1, s5 ; encoding: [0x05,0x00,0x81,0xbe] -; VI-NEXT: s_mov_b32 s4, s6 ; encoding: [0x06,0x00,0x84,0xbe] -; VI-NEXT: s_mov_b32 s5, s7 ; encoding: [0x07,0x00,0x85,0xbe] -; VI-NEXT: s_mov_b32 s6, s2 ; encoding: [0x02,0x00,0x86,0xbe] -; VI-NEXT: s_mov_b32 s7, s3 ; encoding: [0x03,0x00,0x87,0xbe] -; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x48,0xe0,0x00,0x00,0x01,0x80] +; VI-NEXT: s_mov_b32 s8, s2 ; encoding: [0x02,0x00,0x88,0xbe] +; VI-NEXT: s_mov_b32 s9, s3 ; encoding: [0x03,0x00,0x89,0xbe] +; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x48,0xe0,0x00,0x00,0x02,0x80] +; VI-NEXT: s_mov_b32 s4, s0 ; encoding: [0x00,0x00,0x84,0xbe] +; VI-NEXT: s_mov_b32 s5, s1 ; encoding: [0x01,0x00,0x85,0xbe] ; VI-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] ; VI-NEXT: v_add_u16_e32 v0, -16, v0 ; encoding: [0xd0,0x00,0x00,0x4c] -; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x01,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: add_inline_imm_neg_16_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, -16, v0 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm %x = load i16, i16 addrspace(1)* %in %y = add i16 %x, -16 @@ -1298,12 +1298,12 @@ ; ; VI-LABEL: add_inline_imm_63_f16: ; VI: ; %bb.0: +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] -; VI-NEXT: s_load_dword s4, s[4:5], 0x8 ; encoding: [0x02,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, 63 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0x7e,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, 63 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0x7e,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; @@ -1339,12 +1339,12 @@ ; ; VI-LABEL: add_inline_imm_64_f16: ; VI: ; %bb.0: +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] -; VI-NEXT: s_load_dword s4, s[4:5], 0x8 ; encoding: [0x02,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, 64 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0x80,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, 64 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0x80,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; diff --git a/llvm/test/CodeGen/AMDGPU/immv216.ll b/llvm/test/CodeGen/AMDGPU/immv216.ll --- a/llvm/test/CodeGen/AMDGPU/immv216.ll +++ b/llvm/test/CodeGen/AMDGPU/immv216.ll @@ -143,7 +143,7 @@ ; GFX10: buffer_store_dword [[REG]] ; GFX9: s_load_dword [[VAL:s[0-9]+]] -; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x8f,0xd3,0x04,0xe0,0x01,0x08] +; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x8f,0xd3,0x06,0xe0,0x01,0x08] ; GFX9: buffer_store_dword [[REG]] ; FIXME: Shouldn't need right shift and SDWA, also extra copy @@ -168,7 +168,7 @@ ; GFX10: buffer_store_dword [[REG]] ; GFX9: s_load_dword [[VAL:s[0-9]+]] -; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -0.5 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x8f,0xd3,0x04,0xe2,0x01,0x08] +; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -0.5 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x8f,0xd3,0x06,0xe2,0x01,0x08] ; GFX9: buffer_store_dword [[REG]] ; FIXME: Shouldn't need right shift and SDWA, also extra copy @@ -338,7 +338,7 @@ ; GFX9-DAG: buffer_load_dword [[VAL:v[0-9]+]] ; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x6400 ; encoding -; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], [[K]] op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x8f,0xd3,0x00,0x09,0x00,0x08] +; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], [[K]] op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x8f,0xd3,0x00,0x01,0x00,0x08] ; GFX9: buffer_store_dword [[REG]] ; VI-DAG: s_movk_i32 [[K:s[0-9]+]], 0x6400 ; encoding @@ -423,7 +423,7 @@ } ; GCN-LABEL: {{^}}add_inline_imm_neg_1_v2f16: -; GFX9: s_add_i32 [[VAL:s[0-9]+]], s4, -1 +; GFX9: s_add_i32 [[VAL:s[0-9]+]], s6, -1 ; GFX9: v_mov_b32_e32 [[REG:v[0-9]+]], [[VAL]] ; GFX9: buffer_store_dword [[REG]] @@ -440,7 +440,7 @@ } ; GCN-LABEL: {{^}}add_inline_imm_neg_2_v2f16: -; GFX9: s_add_i32 [[VAL:s[0-9]+]], s4, 0xfffefffe +; GFX9: s_add_i32 [[VAL:s[0-9]+]], s6, 0xfffefffe ; GFX9: v_mov_b32_e32 [[REG:v[0-9]+]], [[VAL]] ; GFX9: buffer_store_dword [[REG]] @@ -457,7 +457,7 @@ } ; GCN-LABEL: {{^}}add_inline_imm_neg_16_v2f16: -; GFX9: s_add_i32 [[VAL:s[0-9]+]], s4, 0xfff0fff0 +; GFX9: s_add_i32 [[VAL:s[0-9]+]], s6, 0xfff0fff0 ; GFX9: v_mov_b32_e32 [[REG:v[0-9]+]], [[VAL]] ; GFX9: buffer_store_dword [[REG]] diff --git a/llvm/test/CodeGen/AMDGPU/insert-subvector-unused-scratch.ll b/llvm/test/CodeGen/AMDGPU/insert-subvector-unused-scratch.ll --- a/llvm/test/CodeGen/AMDGPU/insert-subvector-unused-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-subvector-unused-scratch.ll @@ -15,8 +15,8 @@ } ; GCN-LABEL: store_v5i32: -; GCN: ds_read_b32 ; GCN: ds_read_b128 +; GCN: ds_read_b32 ; GCN: ds_write_b32 ; GCN: ds_write_b128 ; GCN: ScratchSize: 0 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -139,9 +139,9 @@ ; GCN-NOT: buffer_ ; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4 ; GCN: s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]] -; GCN: s_andn2_b64 ; GCN: s_mov_b32 s[[KLO:[0-9]+]], 0x3c003c00 ; GCN: s_mov_b32 s[[KHI:[0-9]+]], s[[KLO]] +; GCN: s_andn2_b64 ; GCN: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s{{\[}}[[KLO]]:[[KHI]]] ; GCN: s_or_b64 define amdgpu_kernel void @half4_inselt(<4 x half> addrspace(1)* %out, <4 x half> %vec, i32 %sel) { @@ -219,9 +219,9 @@ ; GCN-NOT: buffer_ ; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4 ; GCN: s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]] -; GCN: s_andn2_b64 ; GCN: s_mov_b32 s[[KLO:[0-9]+]], 0x10001 ; GCN: s_mov_b32 s[[KHI:[0-9]+]], s[[KLO]] +; GCN: s_andn2_b64 ; GCN: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s{{\[}}[[KLO]]:[[KHI]]] ; GCN: s_or_b64 define amdgpu_kernel void @short4_inselt(<4 x i16> addrspace(1)* %out, <4 x i16> %vec, i32 %sel) { @@ -237,8 +237,8 @@ ; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 3 ; GCN: s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]] ; GCN: s_mov_b32 [[K:s[0-9]+]], 0x1010101 -; GCN: s_and_b32 s3, s1, [[K]] ; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[K]] +; GCN: s_and_b32 s6, s4, [[K]] ; GCN: s_andn2_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] ; GCN: s_or_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] define amdgpu_kernel void @byte8_inselt(<8 x i8> addrspace(1)* %out, <8 x i8> %vec, i32 %sel) { diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -9,25 +9,25 @@ define amdgpu_kernel void @insertelement_v2f32_0(<2 x float> addrspace(1)* %out, <2 x float> %a) nounwind { ; SI-LABEL: insertelement_v2f32_0: ; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: insertelement_v2f32_0: ; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 0 @@ -38,25 +38,25 @@ define amdgpu_kernel void @insertelement_v2f32_1(<2 x float> addrspace(1)* %out, <2 x float> %a) nounwind { ; SI-LABEL: insertelement_v2f32_1: ; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v1, 0x40a00000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: insertelement_v2f32_1: ; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0x40a00000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 1 @@ -67,25 +67,25 @@ define amdgpu_kernel void @insertelement_v2i32_0(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind { ; SI-LABEL: insertelement_v2i32_0: ; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: insertelement_v2i32_0: ; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x3e7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <2 x i32> %a, i32 999, i32 0 @@ -96,25 +96,25 @@ define amdgpu_kernel void @insertelement_v2i32_1(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind { ; SI-LABEL: insertelement_v2i32_1: ; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v1, 0x3e7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: insertelement_v2i32_1: ; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0x3e7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <2 x i32> %a, i32 999, i32 1 @@ -127,32 +127,32 @@ define amdgpu_kernel void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { ; SI-LABEL: insertelement_v4f32_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, 0x40a00000 -; SI-NEXT: s_mov_b32 s3, 0x100f000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s0, 0x40a00000 +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: v_mov_b32_e32 v3, s3 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: insertelement_v4f32_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s4, 0x40a00000 -; VI-NEXT: s_mov_b32 s3, 0x1100f000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s0, 0x40a00000 +; VI-NEXT: s_mov_b32 s7, 0x1100f000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 @@ -162,32 +162,32 @@ define amdgpu_kernel void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { ; SI-LABEL: insertelement_v4f32_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s5, 0x40a00000 -; SI-NEXT: s_mov_b32 s3, 0x100f000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s1, 0x40a00000 +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: v_mov_b32_e32 v3, s3 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: insertelement_v4f32_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s5, 0x40a00000 -; VI-NEXT: s_mov_b32 s3, 0x1100f000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s1, 0x40a00000 +; VI-NEXT: s_mov_b32 s7, 0x1100f000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 1 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 @@ -197,32 +197,32 @@ define amdgpu_kernel void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { ; SI-LABEL: insertelement_v4f32_2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s6, 0x40a00000 -; SI-NEXT: s_mov_b32 s3, 0x100f000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s2, 0x40a00000 +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: v_mov_b32_e32 v3, s3 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: insertelement_v4f32_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s6, 0x40a00000 -; VI-NEXT: s_mov_b32 s3, 0x1100f000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s2, 0x40a00000 +; VI-NEXT: s_mov_b32 s7, 0x1100f000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 2 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 @@ -232,32 +232,32 @@ define amdgpu_kernel void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { ; SI-LABEL: insertelement_v4f32_3: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s7, 0x40a00000 -; SI-NEXT: s_mov_b32 s3, 0x100f000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s3, 0x40a00000 +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: v_mov_b32_e32 v3, s3 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: insertelement_v4f32_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s7, 0x40a00000 -; VI-NEXT: s_mov_b32 s3, 0x1100f000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s3, 0x40a00000 +; VI-NEXT: s_mov_b32 s7, 0x1100f000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 3 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 @@ -267,32 +267,32 @@ define amdgpu_kernel void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind { ; SI-LABEL: insertelement_v4i32_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_movk_i32 s4, 0x3e7 -; SI-NEXT: s_mov_b32 s3, 0x100f000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_movk_i32 s0, 0x3e7 +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: v_mov_b32_e32 v3, s3 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: insertelement_v4i32_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_movk_i32 s4, 0x3e7 -; VI-NEXT: s_mov_b32 s3, 0x1100f000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: s_movk_i32 s0, 0x3e7 +; VI-NEXT: s_mov_b32 s7, 0x1100f000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <4 x i32> %a, i32 999, i32 0 store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16 @@ -302,28 +302,28 @@ define amdgpu_kernel void @insertelement_v3f32_1(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind { ; SI-LABEL: insertelement_v3f32_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 -; SI-NEXT: s_mov_b32 s3, 0x100f000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v1, 0x40a00000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: insertelement_v3f32_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 -; VI-NEXT: s_mov_b32 s3, 0x1100f000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; VI-NEXT: s_mov_b32 s7, 0x1100f000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0x40a00000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 1 store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16 @@ -333,28 +333,28 @@ define amdgpu_kernel void @insertelement_v3f32_2(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind { ; SI-LABEL: insertelement_v3f32_2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 -; SI-NEXT: s_mov_b32 s3, 0x100f000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v2, 0x40a00000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: insertelement_v3f32_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 -; VI-NEXT: s_mov_b32 s3, 0x1100f000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; VI-NEXT: s_mov_b32 s7, 0x1100f000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v2, 0x40a00000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 2 store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16 @@ -397,40 +397,40 @@ define amdgpu_kernel void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v2f32: ; SI: ; %bb.0: +; SI-NEXT: s_load_dword s6, s[4:5], 0x4 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; SI-NEXT: s_load_dword s4, s[4:5], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: s_cmp_lg_u32 s4, 1 +; SI-NEXT: s_cmp_lg_u32 s6, 1 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_lg_u32 s4, 0 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_cmp_lg_u32 s6, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v2, s4 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v2f32: ; VI: ; %bb.0: +; VI-NEXT: s_load_dword s6, s[4:5], 0x10 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 ; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: s_cmp_lg_u32 s4, 1 +; VI-NEXT: s_cmp_lg_u32 s6, 1 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_lg_u32 s4, 0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_cmp_lg_u32 s6, 0 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -442,48 +442,48 @@ define amdgpu_kernel void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v3f32: ; SI: ; %bb.0: +; SI-NEXT: s_load_dword s8, s[4:5], 0x8 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4 -; SI-NEXT: s_load_dword s4, s[4:5], 0x8 +; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 ; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s10 -; SI-NEXT: s_cmp_lg_u32 s4, 2 +; SI-NEXT: s_cmp_lg_u32 s8, 2 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_lg_u32 s4, 1 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: s_cmp_lg_u32 s8, 1 ; SI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_lg_u32 s4, 0 +; SI-NEXT: s_cmp_lg_u32 s8, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc -; SI-NEXT: v_mov_b32_e32 v3, s8 +; SI-NEXT: v_mov_b32_e32 v3, s4 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v3f32: ; VI: ; %bb.0: +; VI-NEXT: s_load_dword s8, s[4:5], 0x20 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 -; VI-NEXT: s_load_dword s4, s[4:5], 0x20 +; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 ; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s10 -; VI-NEXT: s_cmp_lg_u32 s4, 2 +; VI-NEXT: s_cmp_lg_u32 s8, 2 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_lg_u32 s4, 1 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: s_cmp_lg_u32 s8, 1 ; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_lg_u32 s4, 0 +; VI-NEXT: s_cmp_lg_u32 s8, 0 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s8 +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -495,56 +495,56 @@ define amdgpu_kernel void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v4f32: ; SI: ; %bb.0: +; SI-NEXT: s_load_dword s8, s[4:5], 0x8 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4 -; SI-NEXT: s_load_dword s4, s[4:5], 0x8 +; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 ; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s11 -; SI-NEXT: s_cmp_lg_u32 s4, 3 +; SI-NEXT: s_cmp_lg_u32 s8, 3 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_lg_u32 s4, 2 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_cmp_lg_u32 s8, 2 ; SI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; SI-NEXT: v_mov_b32_e32 v1, s10 +; SI-NEXT: v_mov_b32_e32 v1, s6 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_lg_u32 s4, 1 +; SI-NEXT: s_cmp_lg_u32 s8, 1 ; SI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_lg_u32 s4, 0 +; SI-NEXT: s_cmp_lg_u32 s8, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc -; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v4, s4 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v4f32: ; VI: ; %bb.0: +; VI-NEXT: s_load_dword s8, s[4:5], 0x20 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 -; VI-NEXT: s_load_dword s4, s[4:5], 0x20 +; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 ; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s11 -; VI-NEXT: s_cmp_lg_u32 s4, 3 +; VI-NEXT: s_cmp_lg_u32 s8, 3 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_lg_u32 s4, 2 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: s_cmp_lg_u32 s8, 2 ; VI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v1, s10 +; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_lg_u32 s4, 1 +; VI-NEXT: s_cmp_lg_u32 s8, 1 ; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_lg_u32 s4, 0 +; VI-NEXT: s_cmp_lg_u32 s8, 0 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v4, s8 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -556,44 +556,44 @@ define amdgpu_kernel void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v8f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dword s6, s[4:5], 0x10 ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 -; SI-NEXT: s_load_dword s4, s[4:5], 0x10 ; SI-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s3, 0x100f000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s6, 3 ; SI-NEXT: v_mov_b32_e32 v0, s11 -; SI-NEXT: s_cmp_lg_u32 s4, 3 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_lg_u32 s4, 2 +; SI-NEXT: s_cmp_lg_u32 s6, 2 ; SI-NEXT: v_cndmask_b32_e32 v3, v4, v0, vcc ; SI-NEXT: v_mov_b32_e32 v0, s10 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_lg_u32 s4, 1 +; SI-NEXT: s_cmp_lg_u32 s6, 1 ; SI-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc ; SI-NEXT: v_mov_b32_e32 v0, s9 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_lg_u32 s4, 0 +; SI-NEXT: s_cmp_lg_u32 s6, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, v4, v0, vcc ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_lg_u32 s4, 7 +; SI-NEXT: s_cmp_lg_u32 s6, 7 ; SI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; SI-NEXT: v_mov_b32_e32 v5, s15 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_lg_u32 s4, 6 +; SI-NEXT: s_cmp_lg_u32 s6, 6 ; SI-NEXT: v_cndmask_b32_e32 v7, v4, v5, vcc ; SI-NEXT: v_mov_b32_e32 v5, s14 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_lg_u32 s4, 5 +; SI-NEXT: s_cmp_lg_u32 s6, 5 ; SI-NEXT: v_cndmask_b32_e32 v6, v4, v5, vcc ; SI-NEXT: v_mov_b32_e32 v5, s13 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_lg_u32 s4, 4 +; SI-NEXT: s_cmp_lg_u32 s6, 4 ; SI-NEXT: v_cndmask_b32_e32 v5, v4, v5, vcc ; SI-NEXT: v_mov_b32_e32 v8, s12 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 @@ -601,44 +601,44 @@ ; ; VI-LABEL: dynamic_insertelement_v8f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s6, s[4:5], 0x40 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 -; VI-NEXT: s_load_dword s4, s[4:5], 0x40 ; VI-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s6, 3 ; VI-NEXT: v_mov_b32_e32 v0, s11 -; VI-NEXT: s_cmp_lg_u32 s4, 3 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_lg_u32 s4, 2 +; VI-NEXT: s_cmp_lg_u32 s6, 2 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v0, vcc ; VI-NEXT: v_mov_b32_e32 v0, s10 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_lg_u32 s4, 1 +; VI-NEXT: s_cmp_lg_u32 s6, 1 ; VI-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc ; VI-NEXT: v_mov_b32_e32 v0, s9 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_lg_u32 s4, 0 +; VI-NEXT: s_cmp_lg_u32 s6, 0 ; VI-NEXT: v_cndmask_b32_e32 v1, v4, v0, vcc ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_lg_u32 s4, 7 +; VI-NEXT: s_cmp_lg_u32 s6, 7 ; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; VI-NEXT: v_mov_b32_e32 v5, s15 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_lg_u32 s4, 6 +; VI-NEXT: s_cmp_lg_u32 s6, 6 ; VI-NEXT: v_cndmask_b32_e32 v7, v4, v5, vcc ; VI-NEXT: v_mov_b32_e32 v5, s14 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_lg_u32 s4, 5 +; VI-NEXT: s_cmp_lg_u32 s6, 5 ; VI-NEXT: v_cndmask_b32_e32 v6, v4, v5, vcc ; VI-NEXT: v_mov_b32_e32 v5, s13 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_lg_u32 s4, 4 +; VI-NEXT: s_cmp_lg_u32 s6, 4 ; VI-NEXT: v_cndmask_b32_e32 v5, v4, v5, vcc ; VI-NEXT: v_mov_b32_e32 v8, s12 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 @@ -722,16 +722,16 @@ define amdgpu_kernel void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dword s8, s[4:5], 0x4 ; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; SI-NEXT: s_load_dword s4, s[4:5], 0x4 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s8, 1 ; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: s_cmp_lg_u32 s4, 1 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_lg_u32 s4, 0 +; SI-NEXT: s_cmp_lg_u32 s8, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc ; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 @@ -741,18 +741,18 @@ ; ; VI-LABEL: dynamic_insertelement_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s8, s[4:5], 0x10 ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s4, 1 -; VI-NEXT: s_cselect_b32 s5, s7, 5 -; VI-NEXT: s_cmp_lg_u32 s4, 0 -; VI-NEXT: s_cselect_b32 s4, s6, 5 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_cmp_lg_u32 s8, 1 +; VI-NEXT: s_cselect_b32 s4, s7, 5 +; VI-NEXT: s_cmp_lg_u32 s8, 0 +; VI-NEXT: s_cselect_b32 s5, s6, 5 +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <2 x i32> %a, i32 5, i32 %b @@ -763,45 +763,45 @@ define amdgpu_kernel void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v3i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4 -; SI-NEXT: s_load_dword s4, s[4:5], 0x8 -; SI-NEXT: s_mov_b32 s3, 0x100f000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dword s8, s[4:5], 0x8 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s10 -; SI-NEXT: s_cmp_lg_u32 s4, 2 +; SI-NEXT: s_cmp_lg_u32 s8, 2 +; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_lg_u32 s4, 1 +; SI-NEXT: s_cmp_lg_u32 s8, 1 ; SI-NEXT: v_cndmask_b32_e32 v2, 5, v0, vcc -; SI-NEXT: v_mov_b32_e32 v0, s9 +; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_lg_u32 s4, 0 +; SI-NEXT: s_cmp_lg_u32 s8, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc -; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc -; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v3i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 -; VI-NEXT: s_load_dword s4, s[4:5], 0x20 -; VI-NEXT: s_mov_b32 s3, 0x1100f000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dword s8, s[4:5], 0x20 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; VI-NEXT: s_mov_b32 s7, 0x1100f000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s4, 2 -; VI-NEXT: s_cselect_b32 s5, s10, 5 -; VI-NEXT: s_cmp_lg_u32 s4, 1 -; VI-NEXT: s_cselect_b32 s6, s9, 5 -; VI-NEXT: s_cmp_lg_u32 s4, 0 -; VI-NEXT: s_cselect_b32 s4, s8, 5 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; VI-NEXT: s_cmp_lg_u32 s8, 2 +; VI-NEXT: s_cselect_b32 s2, s2, 5 +; VI-NEXT: s_cmp_lg_u32 s8, 1 +; VI-NEXT: s_cselect_b32 s1, s1, 5 +; VI-NEXT: s_cmp_lg_u32 s8, 0 +; VI-NEXT: s_cselect_b32 s0, s0, 5 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <3 x i32> %a, i32 5, i32 %b store <3 x i32> %vecins, <3 x i32> addrspace(1)* %out, align 16 @@ -811,17 +811,16 @@ define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, [8 x i32], i32 %val) nounwind { ; SI-LABEL: dynamic_insertelement_v4i32: ; SI: ; %bb.0: +; SI-NEXT: s_load_dword s6, s[4:5], 0x8 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4 -; SI-NEXT: s_load_dword s6, s[4:5], 0x8 ; SI-NEXT: s_load_dword s4, s[4:5], 0x11 ; SI-NEXT: s_mov_b32 s3, 0x100f000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s11 ; SI-NEXT: s_cmp_eq_u32 s6, 3 -; SI-NEXT: v_mov_b32_e32 v4, s4 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: v_mov_b32_e32 v0, s11 +; SI-NEXT: v_mov_b32_e32 v4, s4 ; SI-NEXT: s_cmp_eq_u32 s6, 2 ; SI-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc ; SI-NEXT: v_mov_b32_e32 v0, s10 @@ -834,32 +833,33 @@ ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v4, vcc ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 -; VI-NEXT: s_load_dword s6, s[4:5], 0x20 -; VI-NEXT: s_load_dword s4, s[4:5], 0x44 -; VI-NEXT: s_mov_b32 s3, 0x1100f000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dword s8, s[4:5], 0x20 +; VI-NEXT: s_load_dword s9, s[4:5], 0x44 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; VI-NEXT: s_mov_b32 s7, 0x1100f000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_eq_u32 s6, 3 -; VI-NEXT: s_cselect_b32 s5, s4, s11 -; VI-NEXT: s_cmp_eq_u32 s6, 2 -; VI-NEXT: s_cselect_b32 s7, s4, s10 -; VI-NEXT: s_cmp_eq_u32 s6, 1 -; VI-NEXT: s_cselect_b32 s9, s4, s9 -; VI-NEXT: s_cmp_eq_u32 s6, 0 -; VI-NEXT: s_cselect_b32 s4, s4, s8 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: s_cmp_eq_u32 s8, 3 +; VI-NEXT: s_cselect_b32 s3, s9, s3 +; VI-NEXT: s_cmp_eq_u32 s8, 2 +; VI-NEXT: s_cselect_b32 s2, s9, s2 +; VI-NEXT: s_cmp_eq_u32 s8, 1 +; VI-NEXT: s_cselect_b32 s1, s9, s1 +; VI-NEXT: s_cmp_eq_u32 s8, 0 +; VI-NEXT: s_cselect_b32 s0, s9, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <4 x i32> %a, i32 %val, i32 %b store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16 @@ -869,40 +869,40 @@ define amdgpu_kernel void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v8i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dword s6, s[4:5], 0x10 ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 -; SI-NEXT: s_load_dword s4, s[4:5], 0x10 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s6, 3 ; SI-NEXT: v_mov_b32_e32 v0, s11 -; SI-NEXT: s_cmp_lg_u32 s4, 3 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_lg_u32 s4, 2 +; SI-NEXT: s_cmp_lg_u32 s6, 2 ; SI-NEXT: v_cndmask_b32_e32 v3, 5, v0, vcc ; SI-NEXT: v_mov_b32_e32 v0, s10 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_lg_u32 s4, 1 +; SI-NEXT: s_cmp_lg_u32 s6, 1 ; SI-NEXT: v_cndmask_b32_e32 v2, 5, v0, vcc ; SI-NEXT: v_mov_b32_e32 v0, s9 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_lg_u32 s4, 0 +; SI-NEXT: s_cmp_lg_u32 s6, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_lg_u32 s4, 7 +; SI-NEXT: s_cmp_lg_u32 s6, 7 ; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc ; SI-NEXT: v_mov_b32_e32 v4, s15 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_lg_u32 s4, 6 +; SI-NEXT: s_cmp_lg_u32 s6, 6 ; SI-NEXT: v_cndmask_b32_e32 v7, 5, v4, vcc ; SI-NEXT: v_mov_b32_e32 v4, s14 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_lg_u32 s4, 5 +; SI-NEXT: s_cmp_lg_u32 s6, 5 ; SI-NEXT: v_cndmask_b32_e32 v6, 5, v4, vcc ; SI-NEXT: v_mov_b32_e32 v4, s13 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_lg_u32 s4, 4 +; SI-NEXT: s_cmp_lg_u32 s6, 4 ; SI-NEXT: v_cndmask_b32_e32 v5, 5, v4, vcc ; SI-NEXT: v_mov_b32_e32 v4, s12 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 @@ -913,29 +913,29 @@ ; ; VI-LABEL: dynamic_insertelement_v8i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s6, s[4:5], 0x40 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 -; VI-NEXT: s_load_dword s4, s[4:5], 0x40 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s4, 3 -; VI-NEXT: s_cselect_b32 s5, s11, 5 -; VI-NEXT: s_cmp_lg_u32 s4, 2 -; VI-NEXT: s_cselect_b32 s6, s10, 5 -; VI-NEXT: s_cmp_lg_u32 s4, 1 +; VI-NEXT: s_cmp_lg_u32 s6, 3 +; VI-NEXT: s_cselect_b32 s4, s11, 5 +; VI-NEXT: s_cmp_lg_u32 s6, 2 +; VI-NEXT: s_cselect_b32 s5, s10, 5 +; VI-NEXT: s_cmp_lg_u32 s6, 1 ; VI-NEXT: s_cselect_b32 s7, s9, 5 -; VI-NEXT: s_cmp_lg_u32 s4, 0 +; VI-NEXT: s_cmp_lg_u32 s6, 0 ; VI-NEXT: s_cselect_b32 s8, s8, 5 -; VI-NEXT: s_cmp_lg_u32 s4, 7 +; VI-NEXT: s_cmp_lg_u32 s6, 7 ; VI-NEXT: s_cselect_b32 s9, s15, 5 -; VI-NEXT: s_cmp_lg_u32 s4, 6 +; VI-NEXT: s_cmp_lg_u32 s6, 6 ; VI-NEXT: s_cselect_b32 s10, s14, 5 -; VI-NEXT: s_cmp_lg_u32 s4, 5 +; VI-NEXT: s_cmp_lg_u32 s6, 5 ; VI-NEXT: s_cselect_b32 s11, s13, 5 -; VI-NEXT: s_cmp_lg_u32 s4, 4 -; VI-NEXT: s_cselect_b32 s4, s12, 5 -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_cmp_lg_u32 s6, 4 +; VI-NEXT: s_cselect_b32 s6, s12, 5 +; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s11 ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v3, s9 @@ -943,8 +943,8 @@ ; VI-NEXT: s_nop 0 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <8 x i32> %a, i32 5, i32 %b @@ -955,9 +955,9 @@ define amdgpu_kernel void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v16i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x10 -; SI-NEXT: s_load_dword s4, s[4:5], 0x20 +; SI-NEXT: s_load_dword s6, s[4:5], 0x20 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -977,7 +977,7 @@ ; SI-NEXT: v_mov_b32_e32 v13, s21 ; SI-NEXT: v_mov_b32_e32 v14, s22 ; SI-NEXT: v_mov_b32_e32 v15, s23 -; SI-NEXT: s_mov_b32 m0, s4 +; SI-NEXT: s_mov_b32 m0, s6 ; SI-NEXT: v_movreld_b32_e32 v0, 5 ; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 ; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 @@ -987,9 +987,9 @@ ; ; VI-LABEL: dynamic_insertelement_v16i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; VI-NEXT: s_load_dword s4, s[4:5], 0x80 +; VI-NEXT: s_load_dword s6, s[4:5], 0x80 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1009,7 +1009,7 @@ ; VI-NEXT: v_mov_b32_e32 v13, s21 ; VI-NEXT: v_mov_b32_e32 v14, s22 ; VI-NEXT: v_mov_b32_e32 v15, s23 -; VI-NEXT: s_mov_b32 m0, s4 +; VI-NEXT: s_mov_b32 m0, s6 ; VI-NEXT: v_movreld_b32_e32 v0, 5 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 @@ -1024,34 +1024,34 @@ define amdgpu_kernel void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v2i16: ; SI: ; %bb.0: +; SI-NEXT: s_load_dword s6, s[4:5], 0x3 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dword s6, s[4:5], 0x2 -; SI-NEXT: s_load_dword s4, s[4:5], 0x3 +; SI-NEXT: s_load_dword s4, s[4:5], 0x2 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s4, s4, 4 -; SI-NEXT: s_lshl_b32 s4, 0xffff, s4 -; SI-NEXT: s_andn2_b32 s5, s6, s4 -; SI-NEXT: s_and_b32 s4, s4, 0x50005 -; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshl_b32 s5, s6, 4 +; SI-NEXT: s_lshl_b32 s5, 0xffff, s5 +; SI-NEXT: s_andn2_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s5, 0x50005 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v2i16: ; VI: ; %bb.0: +; VI-NEXT: s_load_dword s6, s[4:5], 0xc ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dword s6, s[4:5], 0x8 -; VI-NEXT: s_load_dword s4, s[4:5], 0xc +; VI-NEXT: s_load_dword s4, s[4:5], 0x8 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s4, s4, 4 -; VI-NEXT: s_lshl_b32 s4, 0xffff, s4 -; VI-NEXT: s_andn2_b32 s5, s6, s4 -; VI-NEXT: s_and_b32 s4, s4, 0x50005 -; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_lshl_b32 s5, s6, 4 +; VI-NEXT: s_lshl_b32 s5, 0xffff, s5 +; VI-NEXT: s_andn2_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s5, 0x50005 +; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -1063,19 +1063,19 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v3i16: ; SI: ; %bb.0: +; SI-NEXT: s_load_dword s6, s[4:5], 0x4 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; SI-NEXT: s_load_dword s4, s[4:5], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s8, s4, 4 -; SI-NEXT: s_mov_b64 s[4:5], 0xffff -; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], s8 +; SI-NEXT: s_lshl_b32 s8, s6, 4 +; SI-NEXT: s_mov_b64 s[6:7], 0xffff +; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], s8 ; SI-NEXT: s_mov_b32 s8, 0x50005 -; SI-NEXT: s_and_b32 s9, s5, s8 -; SI-NEXT: s_and_b32 s8, s4, s8 -; SI-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] +; SI-NEXT: s_and_b32 s9, s7, s8 +; SI-NEXT: s_and_b32 s8, s6, s8 +; SI-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] ; SI-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] ; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 @@ -1085,20 +1085,20 @@ ; ; VI-LABEL: dynamic_insertelement_v3i16: ; VI: ; %bb.0: +; VI-NEXT: s_load_dword s6, s[4:5], 0x10 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s8, s4, 4 -; VI-NEXT: s_mov_b64 s[4:5], 0xffff -; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], s8 +; VI-NEXT: s_lshl_b32 s8, s6, 4 +; VI-NEXT: s_mov_b64 s[6:7], 0xffff +; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], s8 ; VI-NEXT: s_mov_b32 s8, 0x50005 ; VI-NEXT: s_mov_b32 s9, s8 -; VI-NEXT: s_andn2_b64 s[6:7], s[6:7], s[4:5] -; VI-NEXT: s_and_b64 s[4:5], s[4:5], s[8:9] -; VI-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; VI-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; VI-NEXT: s_and_b64 s[6:7], s[6:7], s[8:9] +; VI-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, s5 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -1112,33 +1112,33 @@ define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, [8 x i32], <2 x i8> %a, [8 x i32], i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v2i8: ; SI: ; %bb.0: +; SI-NEXT: s_load_dword s6, s[4:5], 0x13 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dword s6, s[4:5], 0xa -; SI-NEXT: s_load_dword s4, s[4:5], 0x13 +; SI-NEXT: s_load_dword s4, s[4:5], 0xa ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s4, s4, 3 -; SI-NEXT: s_lshl_b32 s4, -1, s4 -; SI-NEXT: s_andn2_b32 s5, s6, s4 -; SI-NEXT: s_and_b32 s4, s4, 0x505 -; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshl_b32 s5, s6, 3 +; SI-NEXT: s_lshl_b32 s5, -1, s5 +; SI-NEXT: s_andn2_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s5, 0x505 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v2i8: ; VI: ; %bb.0: +; VI-NEXT: s_load_dword s6, s[4:5], 0x4c ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dword s6, s[4:5], 0x28 -; VI-NEXT: s_load_dword s4, s[4:5], 0x4c +; VI-NEXT: s_load_dword s4, s[4:5], 0x28 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s4, s4, 3 -; VI-NEXT: v_lshlrev_b16_e64 v0, s4, -1 +; VI-NEXT: s_lshl_b32 s5, s6, 3 +; VI-NEXT: v_lshlrev_b16_e64 v0, s5, -1 ; VI-NEXT: v_not_b32_e32 v1, v0 -; VI-NEXT: v_and_b32_e32 v1, s6, v1 +; VI-NEXT: v_and_b32_e32 v1, s4, v1 ; VI-NEXT: v_and_b32_e32 v0, 0x505, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -1153,17 +1153,17 @@ define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v3i8: ; SI: ; %bb.0: +; SI-NEXT: s_load_dword s6, s[4:5], 0x13 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dword s6, s[4:5], 0xa -; SI-NEXT: s_load_dword s4, s[4:5], 0x13 +; SI-NEXT: s_load_dword s4, s[4:5], 0xa ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s4, s4, 3 -; SI-NEXT: s_lshl_b32 s4, 0xffff, s4 -; SI-NEXT: s_andn2_b32 s5, s6, s4 -; SI-NEXT: s_and_b32 s4, s4, 0x5050505 -; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshl_b32 s5, s6, 3 +; SI-NEXT: s_lshl_b32 s5, 0xffff, s5 +; SI-NEXT: s_andn2_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s5, 0x5050505 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_lshr_b32 s5, s4, 16 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -1173,17 +1173,17 @@ ; ; VI-LABEL: dynamic_insertelement_v3i8: ; VI: ; %bb.0: +; VI-NEXT: s_load_dword s6, s[4:5], 0x4c ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dword s6, s[4:5], 0x28 -; VI-NEXT: s_load_dword s4, s[4:5], 0x4c +; VI-NEXT: s_load_dword s4, s[4:5], 0x28 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s4, s4, 3 -; VI-NEXT: s_lshl_b32 s4, 0xffff, s4 -; VI-NEXT: s_andn2_b32 s5, s6, s4 -; VI-NEXT: s_and_b32 s4, s4, 0x5050505 -; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_lshl_b32 s5, s6, 3 +; VI-NEXT: s_lshl_b32 s5, 0xffff, s5 +; VI-NEXT: s_andn2_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s5, 0x5050505 +; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_lshr_b32 s5, s4, 16 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -1198,34 +1198,34 @@ define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v4i8: ; SI: ; %bb.0: +; SI-NEXT: s_load_dword s6, s[4:5], 0x13 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dword s6, s[4:5], 0xa -; SI-NEXT: s_load_dword s4, s[4:5], 0x13 +; SI-NEXT: s_load_dword s4, s[4:5], 0xa ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s4, s4, 3 -; SI-NEXT: s_lshl_b32 s4, 0xffff, s4 -; SI-NEXT: s_andn2_b32 s5, s6, s4 -; SI-NEXT: s_and_b32 s4, s4, 0x5050505 -; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshl_b32 s5, s6, 3 +; SI-NEXT: s_lshl_b32 s5, 0xffff, s5 +; SI-NEXT: s_andn2_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s5, 0x5050505 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v4i8: ; VI: ; %bb.0: +; VI-NEXT: s_load_dword s6, s[4:5], 0x4c ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dword s6, s[4:5], 0x28 -; VI-NEXT: s_load_dword s4, s[4:5], 0x4c +; VI-NEXT: s_load_dword s4, s[4:5], 0x28 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s4, s4, 3 -; VI-NEXT: s_lshl_b32 s4, 0xffff, s4 -; VI-NEXT: s_andn2_b32 s5, s6, s4 -; VI-NEXT: s_and_b32 s4, s4, 0x5050505 -; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_lshl_b32 s5, s6, 3 +; VI-NEXT: s_lshl_b32 s5, 0xffff, s5 +; VI-NEXT: s_andn2_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s5, 0x5050505 +; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -1237,50 +1237,50 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(4)* %a.ptr, i32 %b) nounwind { ; SI-LABEL: s_dynamic_insertelement_v8i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 -; SI-NEXT: s_load_dword s6, s[4:5], 0x4 -; SI-NEXT: s_mov_b32 s3, 0x100f000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; SI-NEXT: s_load_dword s8, s[4:5], 0x4 +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dwordx2 s[4:5], s[10:11], 0x0 -; SI-NEXT: s_mov_b32 s0, s8 -; SI-NEXT: s_lshl_b32 s8, s6, 3 -; SI-NEXT: s_mov_b64 s[6:7], 0xffff -; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], s8 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SI-NEXT: s_lshl_b32 s8, s8, 3 +; SI-NEXT: s_mov_b64 s[2:3], 0xffff +; SI-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 ; SI-NEXT: s_mov_b32 s8, 0x5050505 -; SI-NEXT: s_mov_b32 s1, s9 -; SI-NEXT: s_and_b32 s9, s7, s8 -; SI-NEXT: s_and_b32 s8, s6, s8 +; SI-NEXT: s_and_b32 s9, s3, s8 +; SI-NEXT: s_and_b32 s8, s2, s8 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] -; SI-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; SI-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_dynamic_insertelement_v8i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 -; VI-NEXT: s_load_dword s6, s[4:5], 0x10 -; VI-NEXT: s_mov_b32 s3, 0x1100f000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s8, s[4:5], 0x10 +; VI-NEXT: s_mov_b32 s7, 0x1100f000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx2 s[4:5], s[10:11], 0x0 -; VI-NEXT: s_mov_b32 s0, s8 -; VI-NEXT: s_lshl_b32 s8, s6, 3 -; VI-NEXT: s_mov_b64 s[6:7], 0xffff -; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], s8 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; VI-NEXT: s_lshl_b32 s8, s8, 3 +; VI-NEXT: s_mov_b64 s[2:3], 0xffff +; VI-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 ; VI-NEXT: s_mov_b32 s8, 0x5050505 -; VI-NEXT: s_mov_b32 s1, s9 -; VI-NEXT: s_and_b32 s9, s7, s8 -; VI-NEXT: s_and_b32 s8, s6, s8 +; VI-NEXT: s_and_b32 s9, s3, s8 +; VI-NEXT: s_and_b32 s8, s2, s8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] -; VI-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; VI-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm %a = load <8 x i8>, <8 x i8> addrspace(4)* %a.ptr, align 4 %vecins = insertelement <8 x i8> %a, i8 5, i32 %b @@ -1291,226 +1291,226 @@ define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v16i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4 -; SI-NEXT: s_load_dword s4, s[4:5], 0x8 +; SI-NEXT: s_load_dword s6, s[4:5], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s5, s11, 24 -; SI-NEXT: s_cmp_lg_u32 s4, 15 -; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: s_lshr_b32 s4, s11, 24 +; SI-NEXT: s_cmp_lg_u32 s6, 15 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_lshr_b32 s5, s11, 16 -; SI-NEXT: s_cmp_lg_u32 s4, 14 +; SI-NEXT: s_lshr_b32 s4, s11, 16 +; SI-NEXT: s_cmp_lg_u32 s6, 14 ; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc -; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc -; SI-NEXT: s_movk_i32 s5, 0xff -; SI-NEXT: s_lshr_b32 s6, s11, 8 +; SI-NEXT: s_movk_i32 s4, 0xff +; SI-NEXT: s_lshr_b32 s5, s11, 8 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; SI-NEXT: v_and_b32_e32 v1, s5, v1 -; SI-NEXT: s_cmp_lg_u32 s4, 13 +; SI-NEXT: v_and_b32_e32 v1, s4, v1 +; SI-NEXT: s_cmp_lg_u32 s6, 13 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_lg_u32 s4, 12 +; SI-NEXT: s_cmp_lg_u32 s6, 12 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc ; SI-NEXT: v_mov_b32_e32 v2, s11 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; SI-NEXT: v_and_b32_e32 v2, s5, v2 +; SI-NEXT: v_and_b32_e32 v2, s4, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: s_mov_b32 s6, 0xffff +; SI-NEXT: s_mov_b32 s5, 0xffff ; SI-NEXT: s_lshr_b32 s7, s10, 24 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v1, s6, v1 -; SI-NEXT: s_cmp_lg_u32 s4, 11 +; SI-NEXT: v_and_b32_e32 v1, s5, v1 +; SI-NEXT: s_cmp_lg_u32 s6, 11 ; SI-NEXT: v_or_b32_e32 v3, v1, v0 ; SI-NEXT: v_mov_b32_e32 v0, s7 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_lshr_b32 s7, s10, 16 -; SI-NEXT: s_cmp_lg_u32 s4, 10 +; SI-NEXT: s_cmp_lg_u32 s6, 10 ; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc ; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc ; SI-NEXT: s_lshr_b32 s7, s10, 8 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; SI-NEXT: v_and_b32_e32 v1, s5, v1 -; SI-NEXT: s_cmp_lg_u32 s4, 9 +; SI-NEXT: v_and_b32_e32 v1, s4, v1 +; SI-NEXT: s_cmp_lg_u32 s6, 9 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_lg_u32 s4, 8 +; SI-NEXT: s_cmp_lg_u32 s6, 8 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; SI-NEXT: v_and_b32_e32 v2, s5, v2 +; SI-NEXT: v_and_b32_e32 v2, s4, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: s_lshr_b32 s7, s9, 24 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v1, s6, v1 -; SI-NEXT: s_cmp_lg_u32 s4, 7 +; SI-NEXT: v_and_b32_e32 v1, s5, v1 +; SI-NEXT: s_cmp_lg_u32 s6, 7 ; SI-NEXT: v_or_b32_e32 v2, v1, v0 ; SI-NEXT: v_mov_b32_e32 v0, s7 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_lshr_b32 s7, s9, 16 -; SI-NEXT: s_cmp_lg_u32 s4, 6 +; SI-NEXT: s_cmp_lg_u32 s6, 6 ; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc ; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc ; SI-NEXT: s_lshr_b32 s7, s9, 8 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; SI-NEXT: v_and_b32_e32 v1, s5, v1 -; SI-NEXT: s_cmp_lg_u32 s4, 5 +; SI-NEXT: v_and_b32_e32 v1, s4, v1 +; SI-NEXT: s_cmp_lg_u32 s6, 5 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_lg_u32 s4, 4 +; SI-NEXT: s_cmp_lg_u32 s6, 4 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc ; SI-NEXT: v_mov_b32_e32 v4, s9 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; SI-NEXT: v_and_b32_e32 v4, s5, v4 +; SI-NEXT: v_and_b32_e32 v4, s4, v4 ; SI-NEXT: v_or_b32_e32 v1, v4, v1 ; SI-NEXT: s_lshr_b32 s7, s8, 24 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v1, s6, v1 -; SI-NEXT: s_cmp_lg_u32 s4, 3 +; SI-NEXT: v_and_b32_e32 v1, s5, v1 +; SI-NEXT: s_cmp_lg_u32 s6, 3 ; SI-NEXT: v_or_b32_e32 v1, v1, v0 ; SI-NEXT: v_mov_b32_e32 v0, s7 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_lshr_b32 s7, s8, 16 -; SI-NEXT: s_cmp_lg_u32 s4, 2 +; SI-NEXT: s_cmp_lg_u32 s6, 2 ; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc ; SI-NEXT: v_mov_b32_e32 v4, s7 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc ; SI-NEXT: s_lshr_b32 s7, s8, 8 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; SI-NEXT: v_and_b32_e32 v4, s5, v4 -; SI-NEXT: s_cmp_lg_u32 s4, 1 +; SI-NEXT: v_and_b32_e32 v4, s4, v4 +; SI-NEXT: s_cmp_lg_u32 s6, 1 ; SI-NEXT: v_or_b32_e32 v0, v4, v0 ; SI-NEXT: v_mov_b32_e32 v4, s7 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_lg_u32 s4, 0 +; SI-NEXT: s_cmp_lg_u32 s6, 0 ; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc ; SI-NEXT: v_mov_b32_e32 v5, s8 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v5, 5, v5, vcc ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; SI-NEXT: v_and_b32_e32 v5, s5, v5 +; SI-NEXT: v_and_b32_e32 v5, s4, v5 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v4, s6, v4 +; SI-NEXT: v_and_b32_e32 v4, s5, v4 ; SI-NEXT: v_or_b32_e32 v0, v4, v0 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v16i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 -; VI-NEXT: s_load_dword s4, s[4:5], 0x20 +; VI-NEXT: s_load_dword s6, s[4:5], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s5, s11, 24 -; VI-NEXT: s_cmp_lg_u32 s4, 15 -; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: s_lshr_b32 s4, s11, 24 +; VI-NEXT: s_cmp_lg_u32 s6, 15 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_lshr_b32 s5, s11, 16 -; VI-NEXT: s_cmp_lg_u32 s4, 14 +; VI-NEXT: s_lshr_b32 s4, s11, 16 +; VI-NEXT: s_cmp_lg_u32 s6, 14 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_lshr_b32 s5, s11, 8 +; VI-NEXT: s_lshr_b32 s4, s11, 8 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc -; VI-NEXT: s_cmp_lg_u32 s4, 13 +; VI-NEXT: s_cmp_lg_u32 s6, 13 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_lg_u32 s4, 12 +; VI-NEXT: s_cmp_lg_u32 s6, 12 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc ; VI-NEXT: v_mov_b32_e32 v2, s11 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc -; VI-NEXT: s_lshr_b32 s5, s10, 24 +; VI-NEXT: s_lshr_b32 s4, s10, 24 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_cmp_lg_u32 s4, 11 +; VI-NEXT: s_cmp_lg_u32 s6, 11 ; VI-NEXT: v_or_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_lshr_b32 s5, s10, 16 -; VI-NEXT: s_cmp_lg_u32 s4, 10 +; VI-NEXT: s_lshr_b32 s4, s10, 16 +; VI-NEXT: s_cmp_lg_u32 s6, 10 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_lshr_b32 s5, s10, 8 +; VI-NEXT: s_lshr_b32 s4, s10, 8 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc -; VI-NEXT: s_cmp_lg_u32 s4, 9 +; VI-NEXT: s_cmp_lg_u32 s6, 9 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_lg_u32 s4, 8 +; VI-NEXT: s_cmp_lg_u32 s6, 8 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc -; VI-NEXT: s_lshr_b32 s5, s9, 24 +; VI-NEXT: s_lshr_b32 s4, s9, 24 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_cmp_lg_u32 s4, 7 +; VI-NEXT: s_cmp_lg_u32 s6, 7 ; VI-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_lshr_b32 s5, s9, 16 -; VI-NEXT: s_cmp_lg_u32 s4, 6 +; VI-NEXT: s_lshr_b32 s4, s9, 16 +; VI-NEXT: s_cmp_lg_u32 s6, 6 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_lshr_b32 s5, s9, 8 +; VI-NEXT: s_lshr_b32 s4, s9, 8 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc -; VI-NEXT: s_cmp_lg_u32 s4, 5 +; VI-NEXT: s_cmp_lg_u32 s6, 5 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_lg_u32 s4, 4 +; VI-NEXT: s_cmp_lg_u32 s6, 4 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc ; VI-NEXT: v_mov_b32_e32 v4, s9 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc -; VI-NEXT: s_lshr_b32 s5, s8, 24 +; VI-NEXT: s_lshr_b32 s4, s8, 24 ; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_cmp_lg_u32 s4, 3 +; VI-NEXT: s_cmp_lg_u32 s6, 3 ; VI-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_lshr_b32 s5, s8, 16 -; VI-NEXT: s_cmp_lg_u32 s4, 2 +; VI-NEXT: s_lshr_b32 s4, s8, 16 +; VI-NEXT: s_cmp_lg_u32 s6, 2 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc -; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_lshr_b32 s5, s8, 8 +; VI-NEXT: s_lshr_b32 s4, s8, 8 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc -; VI-NEXT: s_cmp_lg_u32 s4, 1 +; VI-NEXT: s_cmp_lg_u32 s6, 1 ; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_lg_u32 s4, 0 +; VI-NEXT: s_cmp_lg_u32 s6, 0 ; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc ; VI-NEXT: v_mov_b32_e32 v5, s8 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1603,41 +1603,41 @@ define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, [8 x i32], <2 x double> %a, [8 x i32], i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v2f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xc -; SI-NEXT: s_load_dword s4, s[4:5], 0x18 +; SI-NEXT: s_load_dword s8, s[4:5], 0x18 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xc +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; SI-NEXT: v_mov_b32_e32 v1, 0x40200000 -; SI-NEXT: s_mov_b32 s3, 0x100f000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s11 -; SI-NEXT: s_cmp_eq_u32 s4, 1 +; SI-NEXT: s_cmp_eq_u32 s8, 1 +; SI-NEXT: v_mov_b32_e32 v0, s3 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; SI-NEXT: v_mov_b32_e32 v0, s10 -; SI-NEXT: s_cmp_eq_u32 s4, 0 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: s_cmp_eq_u32 s8, 0 ; SI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc -; SI-NEXT: v_mov_b32_e32 v0, s9 +; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc -; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v2f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s12, s[4:5], 0x60 +; VI-NEXT: s_load_dword s10, s[4:5], 0x60 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x30 -; VI-NEXT: s_mov_b32 s4, 0 -; VI-NEXT: s_mov_b32 s5, 0x40200000 +; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x30 +; VI-NEXT: s_mov_b32 s8, 0 +; VI-NEXT: s_mov_b32 s9, 0x40200000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s12, 1 -; VI-NEXT: s_cselect_b64 s[6:7], s[10:11], s[4:5] -; VI-NEXT: s_cmp_lg_u32 s12, 0 -; VI-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] +; VI-NEXT: s_cmp_lg_u32 s10, 1 +; VI-NEXT: s_cselect_b64 s[6:7], s[6:7], s[8:9] +; VI-NEXT: s_cmp_lg_u32 s10, 0 +; VI-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -1653,44 +1653,44 @@ define amdgpu_kernel void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s6, s[4:5], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4 -; SI-NEXT: s_mov_b32 s3, 0x100f000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dword s10, s[4:5], 0x8 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_eq_u32 s6, 1 -; SI-NEXT: v_mov_b32_e32 v0, s11 -; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5] -; SI-NEXT: v_mov_b32_e32 v0, s10 -; SI-NEXT: s_cmp_eq_u32 s6, 0 -; SI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5] -; SI-NEXT: v_mov_b32_e32 v0, s9 -; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5] -; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5] -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_cmp_eq_u32 s10, 1 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: s_cselect_b64 s[8:9], -1, 0 +; SI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: s_cmp_eq_u32 s10, 0 +; SI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[2:3] +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[2:3] +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s6, s[4:5], 0x20 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 -; VI-NEXT: s_mov_b32 s3, 0x1100f000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dword s8, s[4:5], 0x20 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; VI-NEXT: s_mov_b32 s7, 0x1100f000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s6, 1 -; VI-NEXT: s_cselect_b64 s[4:5], s[10:11], 5 -; VI-NEXT: s_cmp_lg_u32 s6, 0 -; VI-NEXT: s_cselect_b64 s[6:7], s[8:9], 5 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: s_cmp_lg_u32 s8, 1 +; VI-NEXT: s_cselect_b64 s[2:3], s[2:3], 5 +; VI-NEXT: s_cmp_lg_u32 s8, 0 +; VI-NEXT: s_cselect_b64 s[0:1], s[0:1], 5 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <2 x i64> %a, i64 5, i32 %b store <2 x i64> %vecins, <2 x i64> addrspace(1)* %out, align 8 @@ -1700,57 +1700,57 @@ define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v3i64: ; SI: ; %bb.0: +; SI-NEXT: s_load_dword s12, s[4:5], 0x10 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xc -; SI-NEXT: s_load_dword s12, s[4:5], 0x10 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xc ; SI-NEXT: s_mov_b32 s3, 0x100f000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s11 -; SI-NEXT: v_mov_b32_e32 v4, s7 ; SI-NEXT: s_cmp_eq_u32 s12, 1 -; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5] +; SI-NEXT: s_cselect_b64 s[6:7], -1, 0 +; SI-NEXT: v_mov_b32_e32 v0, s11 +; SI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[6:7] ; SI-NEXT: v_mov_b32_e32 v0, s10 ; SI-NEXT: s_cmp_eq_u32 s12, 0 -; SI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5] +; SI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[6:7] ; SI-NEXT: v_mov_b32_e32 v0, s9 -; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5] +; SI-NEXT: s_cselect_b64 s[6:7], -1, 0 +; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[6:7] ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: s_cmp_eq_u32 s12, 2 -; SI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5] -; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v5, v4, 0, s[4:5] -; SI-NEXT: v_mov_b32_e32 v4, s6 -; SI-NEXT: v_cndmask_b32_e64 v4, v4, 5, s[4:5] +; SI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[6:7] +; SI-NEXT: v_mov_b32_e32 v4, s5 +; SI-NEXT: s_cselect_b64 s[6:7], -1, 0 +; SI-NEXT: v_cndmask_b32_e64 v5, v4, 0, s[6:7] +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_cndmask_b32_e64 v4, v4, 5, s[6:7] ; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v3i64: ; VI: ; %bb.0: +; VI-NEXT: s_load_dword s12, s[4:5], 0x40 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x20 -; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x30 -; VI-NEXT: s_load_dword s12, s[4:5], 0x40 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x30 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s12, 1 -; VI-NEXT: s_cselect_b64 s[4:5], s[10:11], 5 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_cselect_b64 s[6:7], s[10:11], 5 ; VI-NEXT: s_cmp_lg_u32 s12, 0 ; VI-NEXT: s_cselect_b64 s[8:9], s[8:9], 5 ; VI-NEXT: s_cmp_lg_u32 s12, 2 -; VI-NEXT: s_cselect_b64 s[6:7], s[6:7], 5 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: s_cselect_b64 s[4:5], s[4:5], 5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:16 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <3 x i64> %a, i64 5, i32 %b @@ -1761,36 +1761,36 @@ define amdgpu_kernel void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v4f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dword s6, s[4:5], 0x10 ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 -; SI-NEXT: s_load_dword s4, s[4:5], 0x10 ; SI-NEXT: v_mov_b32_e32 v4, 0x40200000 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s3, 0x100f000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_cmp_eq_u32 s6, 1 ; SI-NEXT: v_mov_b32_e32 v0, s11 -; SI-NEXT: s_cmp_eq_u32 s4, 1 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc ; SI-NEXT: v_mov_b32_e32 v0, s10 -; SI-NEXT: s_cmp_eq_u32 s4, 0 +; SI-NEXT: s_cmp_eq_u32 s6, 0 ; SI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc ; SI-NEXT: v_mov_b32_e32 v0, s9 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v4, vcc ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: s_cmp_eq_u32 s4, 3 +; SI-NEXT: s_cmp_eq_u32 s6, 3 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; SI-NEXT: v_mov_b32_e32 v5, s15 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v7, v5, v4, vcc ; SI-NEXT: v_mov_b32_e32 v5, s14 -; SI-NEXT: s_cmp_eq_u32 s4, 2 +; SI-NEXT: s_cmp_eq_u32 s6, 2 ; SI-NEXT: v_cndmask_b32_e64 v6, v5, 0, vcc ; SI-NEXT: v_mov_b32_e32 v5, s13 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc ; SI-NEXT: v_mov_b32_e32 v4, s12 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 @@ -1799,8 +1799,8 @@ ; VI-LABEL: dynamic_insertelement_v4f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s16, s[4:5], 0x40 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_mov_b32 s4, 0 ; VI-NEXT: s_mov_b32 s5, 0x40200000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 @@ -1834,15 +1834,14 @@ define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) #0 { ; SI-LABEL: dynamic_insertelement_v8f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dword s6, s[4:5], 0x20 ; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x10 -; SI-NEXT: s_load_dword s4, s[4:5], 0x20 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: v_mov_b32_e32 v16, 0x40200000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_lshl_b32 s4, s6, 1 ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: s_lshl_b32 s4, s4, 1 ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: v_mov_b32_e32 v3, s11 @@ -1860,6 +1859,7 @@ ; SI-NEXT: v_mov_b32_e32 v15, s23 ; SI-NEXT: s_mov_b32 m0, s4 ; SI-NEXT: v_movreld_b32_e32 v0, 0 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_movreld_b32_e32 v1, v16 ; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 ; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 @@ -1869,15 +1869,14 @@ ; ; VI-LABEL: dynamic_insertelement_v8f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s6, s[4:5], 0x80 ; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; VI-NEXT: s_load_dword s4, s[4:5], 0x80 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v16, 0x40200000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_lshl_b32 s4, s6, 1 ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: s_lshl_b32 s4, s4, 1 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v3, s11 @@ -1895,6 +1894,7 @@ ; VI-NEXT: v_mov_b32_e32 v15, s23 ; VI-NEXT: s_mov_b32 m0, s4 ; VI-NEXT: v_movreld_b32_e32 v0, 0 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_movreld_b32_e32 v1, v16 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -20,11 +20,11 @@ ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CIVI-NEXT: v_mov_b32_e32 v0, s0 -; CIVI-NEXT: s_load_dword s0, s[2:3], 0x0 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_and_b32 s0, s0, 0xffff0000 +; CIVI-NEXT: s_and_b32 s0, s2, 0xffff0000 ; CIVI-NEXT: s_or_b32 s0, s0, 0x3e7 ; CIVI-NEXT: v_mov_b32_e32 v2, s0 ; CIVI-NEXT: flat_store_dword v[0:1], v2 @@ -55,13 +55,13 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x30 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_load_dword s0, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_and_b32 s1, s4, 0xffff +; VI-NEXT: s_and_b32 s0, s4, 0xffff ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s0, s0, 0xffff0000 -; VI-NEXT: s_or_b32 s0, s1, s0 +; VI-NEXT: s_and_b32 s1, s2, 0xffff0000 +; VI-NEXT: s_or_b32 s0, s0, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -110,18 +110,18 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x30 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_load_dword s0, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_and_b32 s1, s4, 0xffff +; VI-NEXT: s_and_b32 s0, s4, 0xffff ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s2, s0, 16 -; VI-NEXT: s_and_b32 s0, s0, 0xffff0000 -; VI-NEXT: s_or_b32 s0, s1, s0 +; VI-NEXT: s_lshr_b32 s1, s2, 16 +; VI-NEXT: s_and_b32 s2, s2, 0xffff0000 +; VI-NEXT: s_or_b32 s0, s0, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: ;;#ASMSTART -; VI-NEXT: ; use s2 +; VI-NEXT: ; use s1 ; VI-NEXT: ;;#ASMEND ; VI-NEXT: s_endpgm ; @@ -172,13 +172,13 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x30 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_load_dword s0, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_lshr_b32 s1, s4, 16 +; VI-NEXT: s_lshr_b32 s0, s4, 16 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s0, s0, 0xffff0000 -; VI-NEXT: s_or_b32 s0, s1, s0 +; VI-NEXT: s_and_b32 s1, s2, 0xffff0000 +; VI-NEXT: s_or_b32 s0, s0, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -298,21 +298,21 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_load_dword s0, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_lshr_b32 s1, s4, 16 +; VI-NEXT: s_lshr_b32 s0, s4, 16 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s2, s0, 16 -; VI-NEXT: s_and_b32 s0, s0, 0xffff0000 -; VI-NEXT: s_or_b32 s0, s1, s0 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_lshr_b32 s1, s2, 16 +; VI-NEXT: s_and_b32 s2, s2, 0xffff0000 +; VI-NEXT: s_or_b32 s2, s0, s2 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: ;;#ASMSTART -; VI-NEXT: ; use s1 +; VI-NEXT: ; use s0 ; VI-NEXT: ;;#ASMEND ; VI-NEXT: ;;#ASMSTART -; VI-NEXT: ; use s2 +; VI-NEXT: ; use s1 ; VI-NEXT: ;;#ASMEND ; VI-NEXT: s_endpgm ; @@ -321,20 +321,20 @@ ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: s_load_dword s0, s[2:3], 0x0 -; CI-NEXT: v_mov_b32_e32 v2, s4 +; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: s_lshr_b32 s1, s4, 16 +; CI-NEXT: v_mov_b32_e32 v2, s4 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: s_lshr_b32 s0, s4, 16 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s0, s0, 16 -; CI-NEXT: v_alignbit_b32 v2, s0, v2, 16 +; CI-NEXT: s_lshr_b32 s1, s2, 16 +; CI-NEXT: v_alignbit_b32 v2, s1, v2, 16 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: ;;#ASMSTART -; CI-NEXT: ; use s1 +; CI-NEXT: ; use s0 ; CI-NEXT: ;;#ASMEND ; CI-NEXT: ;;#ASMSTART -; CI-NEXT: ; use s0 +; CI-NEXT: ; use s1 ; CI-NEXT: ;;#ASMEND ; CI-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr @@ -368,11 +368,11 @@ ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CIVI-NEXT: v_mov_b32_e32 v0, s0 -; CIVI-NEXT: s_load_dword s0, s[2:3], 0x0 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_and_b32 s0, s0, 0xffff +; CIVI-NEXT: s_and_b32 s0, s2, 0xffff ; CIVI-NEXT: s_or_b32 s0, s0, 0x3e70000 ; CIVI-NEXT: v_mov_b32_e32 v2, s0 ; CIVI-NEXT: flat_store_dword v[0:1], v2 @@ -402,13 +402,13 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x30 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_load_dword s0, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_lshl_b32 s1, s4, 16 +; VI-NEXT: s_lshl_b32 s0, s4, 16 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s0, s0, 0xffff -; VI-NEXT: s_or_b32 s0, s0, s1 +; VI-NEXT: s_and_b32 s1, s2, 0xffff +; VI-NEXT: s_or_b32 s0, s1, s0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -452,11 +452,11 @@ ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CIVI-NEXT: v_mov_b32_e32 v0, s0 -; CIVI-NEXT: s_load_dword s0, s[2:3], 0x0 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_and_b32 s0, s0, 0xffff0000 +; CIVI-NEXT: s_and_b32 s0, s2, 0xffff0000 ; CIVI-NEXT: s_or_b32 s0, s0, 0x4500 ; CIVI-NEXT: v_mov_b32_e32 v2, s0 ; CIVI-NEXT: flat_store_dword v[0:1], v2 @@ -484,11 +484,11 @@ ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CIVI-NEXT: v_mov_b32_e32 v0, s0 -; CIVI-NEXT: s_load_dword s0, s[2:3], 0x0 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_and_b32 s0, s0, 0xffff +; CIVI-NEXT: s_and_b32 s0, s2, 0xffff ; CIVI-NEXT: s_or_b32 s0, s0, 0x45000000 ; CIVI-NEXT: v_mov_b32_e32 v2, s0 ; CIVI-NEXT: flat_store_dword v[0:1], v2 @@ -521,14 +521,14 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; VI-NEXT: v_or_b32_e32 v0, 0x3e7, v0 -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; VI-NEXT: v_or_b32_e32 v2, 0x3e7, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_insertelement_v2i16_0: @@ -539,14 +539,14 @@ ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dword v0, v[0:1] -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: flat_load_dword v3, v[0:1] +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; CI-NEXT: v_or_b32_e32 v0, 0x3e7, v0 -; CI-NEXT: flat_store_dword v[2:3], v0 +; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; CI-NEXT: v_or_b32_e32 v2, 0x3e7, v2 +; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -582,15 +582,15 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: s_lshr_b32 s0, s4, 16 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; VI-NEXT: v_or_b32_e32 v0, s0, v0 -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; VI-NEXT: v_or_b32_e32 v2, s0, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_insertelement_v2i16_0_reghi: @@ -645,14 +645,14 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; VI-NEXT: v_or_b32_e32 v0, 53, v0 -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; VI-NEXT: v_or_b32_e32 v2, 53, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_insertelement_v2i16_0_inlineimm: @@ -663,14 +663,14 @@ ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dword v0, v[0:1] -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: flat_load_dword v3, v[0:1] +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; CI-NEXT: v_or_b32_e32 v0, 53, v0 -; CI-NEXT: flat_store_dword v[2:3], v0 +; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; CI-NEXT: v_or_b32_e32 v2, 53, v2 +; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -705,14 +705,14 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, 0x3e70000 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v2, 0x3e70000 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_insertelement_v2i16_1: @@ -723,14 +723,14 @@ ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dword v0, v[0:1] -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: flat_load_dword v3, v[0:1] +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; CI-NEXT: v_or_b32_e32 v0, 0x3e70000, v0 -; CI-NEXT: flat_store_dword v[2:3], v0 +; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; CI-NEXT: v_or_b32_e32 v2, 0x3e70000, v2 +; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -763,14 +763,14 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, 0xfff10000 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v2, 0xfff10000 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_insertelement_v2i16_1_inlineimm: @@ -781,14 +781,14 @@ ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dword v0, v[0:1] -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: flat_load_dword v3, v[0:1] +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; CI-NEXT: v_or_b32_e32 v0, 0xfff10000, v0 -; CI-NEXT: flat_store_dword v[2:3], v0 +; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; CI-NEXT: v_or_b32_e32 v2, 0xfff10000, v2 +; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -822,14 +822,14 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; VI-NEXT: v_or_b32_e32 v0, 0x4500, v0 -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; VI-NEXT: v_or_b32_e32 v2, 0x4500, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_insertelement_v2f16_0: @@ -840,14 +840,14 @@ ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dword v0, v[0:1] -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: flat_load_dword v3, v[0:1] +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; CI-NEXT: v_or_b32_e32 v0, 0x4500, v0 -; CI-NEXT: flat_store_dword v[2:3], v0 +; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; CI-NEXT: v_or_b32_e32 v2, 0x4500, v2 +; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -880,14 +880,14 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; VI-NEXT: v_or_b32_e32 v0, 53, v0 -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; VI-NEXT: v_or_b32_e32 v2, 53, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_insertelement_v2f16_0_inlineimm: @@ -898,14 +898,14 @@ ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dword v0, v[0:1] -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: flat_load_dword v3, v[0:1] +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; CI-NEXT: v_or_b32_e32 v0, 53, v0 -; CI-NEXT: flat_store_dword v[2:3], v0 +; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; CI-NEXT: v_or_b32_e32 v2, 53, v2 +; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -939,14 +939,14 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, 0x45000000 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v2, 0x45000000 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_insertelement_v2f16_1: @@ -957,14 +957,14 @@ ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dword v0, v[0:1] -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: flat_load_dword v3, v[0:1] +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; CI-NEXT: v_or_b32_e32 v0, 0x45000000, v0 -; CI-NEXT: flat_store_dword v[2:3], v0 +; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; CI-NEXT: v_or_b32_e32 v2, 0x45000000, v2 +; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -997,14 +997,14 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, 0x230000 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v2, 0x230000 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_insertelement_v2f16_1_inlineimm: @@ -1015,14 +1015,14 @@ ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dword v0, v[0:1] -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: flat_load_dword v3, v[0:1] +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; CI-NEXT: v_or_b32_e32 v0, 0x230000, v0 -; CI-NEXT: flat_store_dword v[2:3], v0 +; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; CI-NEXT: v_or_b32_e32 v2, 0x230000, v2 +; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1038,8 +1038,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 addrspace(4)* %idx.ptr) #0 { ; GFX9-LABEL: s_insertelement_v2i16_dynamic: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 @@ -1056,15 +1056,15 @@ ; ; VI-LABEL: s_insertelement_v2i16_dynamic: ; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_load_dword s0, s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[6:7], 0x0 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s0, s0, 4 +; VI-NEXT: s_lshl_b32 s0, s4, 4 ; VI-NEXT: s_lshl_b32 s0, 0xffff, s0 ; VI-NEXT: s_andn2_b32 s1, s2, s0 ; VI-NEXT: s_and_b32 s0, s0, 0x3e703e7 @@ -1075,15 +1075,15 @@ ; ; CI-LABEL: s_insertelement_v2i16_dynamic: ; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: s_load_dword s0, s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[6:7], 0x0 ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 +; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshl_b32 s0, s0, 4 +; CI-NEXT: s_lshl_b32 s0, s4, 4 ; CI-NEXT: s_lshl_b32 s0, 0xffff, s0 ; CI-NEXT: s_andn2_b32 s1, s2, s0 ; CI-NEXT: s_and_b32 s0, s0, 0x3e703e7 @@ -1123,16 +1123,16 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: s_lshl_b32 s0, s4, 4 -; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_lshl_b32 s0, 0xffff, s0 -; VI-NEXT: v_mov_b32_e32 v1, 0x3e703e7 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v2, 0x3e703e7 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_bfi_b32 v0, s0, v1, v0 -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_bfi_b32 v2, s0, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_insertelement_v2i16_dynamic_sgpr: @@ -1144,16 +1144,16 @@ ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dword v0, v[0:1] -; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 +; CI-NEXT: flat_load_dword v3, v[0:1] +; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 ; CI-NEXT: s_lshl_b32 s0, s4, 4 -; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_lshl_b32 s0, 0xffff, s0 -; CI-NEXT: v_mov_b32_e32 v1, 0x3e703e7 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: v_mov_b32_e32 v2, 0x3e703e7 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_bfi_b32 v0, s0, v1, v0 -; CI-NEXT: flat_store_dword v[2:3], v0 +; CI-NEXT: v_bfi_b32 v2, s0, v2, v3 +; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1168,8 +1168,8 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 { ; GFX9-LABEL: v_insertelement_v2f16_dynamic_vgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] @@ -1188,23 +1188,23 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 -; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b32 s2, 0xffff +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; VI-NEXT: s_mov_b32 s0, 0x12341234 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v4 ; VI-NEXT: v_lshlrev_b32_e64 v2, v2, s2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_bfi_b32 v2, v2, s0, v3 @@ -1215,22 +1215,22 @@ ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 -; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: v_mov_b32_e32 v1, s5 +; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: v_mov_b32_e32 v3, s5 -; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; CI-NEXT: flat_load_dword v2, v[2:3] +; CI-NEXT: flat_load_dword v4, v[0:1] +; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; CI-NEXT: flat_load_dword v3, v[0:1] +; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 ; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v4 ; CI-NEXT: s_mov_b32 s0, 0x12341234 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; CI-NEXT: v_lshlrev_b32_e32 v2, 4, v4 ; CI-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_bfi_b32 v2, v2, s0, v3 @@ -1577,17 +1577,17 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %val) #0 { ; GFX9-LABEL: v_insertelement_v4i16_dynamic_vgpr: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: global_load_dword v2, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] ; GFX9-NEXT: s_mov_b64 s[2:3], 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s6, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s4, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v1, v3, s2, v1 ; GFX9-NEXT: v_bfi_b32 v0, v2, s2, v0 @@ -1597,14 +1597,14 @@ ; VI-LABEL: v_insertelement_v4i16_dynamic_vgpr: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: flat_load_dword v4, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: s_mov_b64 s[2:3], 0xffff ; VI-NEXT: v_mov_b32_e32 v3, s1 @@ -1624,14 +1624,14 @@ ; CI-LABEL: v_insertelement_v4i16_dynamic_vgpr: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dword s4, s[4:5], 0x4 +; CI-NEXT: flat_load_dword v4, v[0:1] glc +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dword v4, v[0:1] glc -; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: s_mov_b64 s[2:3], 0xffff ; CI-NEXT: v_mov_b32_e32 v3, s1 diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll --- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll @@ -20,20 +20,20 @@ ; ; VI-LABEL: i8_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: s_and_b32 s0, s0, 0xff -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_and_b32 s2, s2, 0xff +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: i8_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xff @@ -92,20 +92,20 @@ ; ; VI-LABEL: i8_zext_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: s_and_b32 s0, s0, 0xff -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_and_b32 s2, s2, 0xff +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: i8_zext_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xff @@ -167,20 +167,20 @@ ; ; VI-LABEL: i8_sext_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: s_sext_i32_i8 s0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_sext_i32_i8 s2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: i8_sext_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i8 s2, s2 @@ -242,20 +242,20 @@ ; ; VI-LABEL: i16_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: s_and_b32 s0, s0, 0xffff -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_and_b32 s2, s2, 0xffff +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: i16_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xffff @@ -314,20 +314,20 @@ ; ; VI-LABEL: i16_zext_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: s_and_b32 s0, s0, 0xffff -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_and_b32 s2, s2, 0xffff +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: i16_zext_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xffff @@ -389,20 +389,20 @@ ; ; VI-LABEL: i16_sext_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: s_sext_i32_i16 s0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_sext_i32_i16 s2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: i16_sext_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i16 s2, s2 @@ -474,8 +474,8 @@ ; ; GFX9-LABEL: i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -533,8 +533,8 @@ ; ; GFX9-LABEL: f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -592,8 +592,8 @@ ; ; GFX9-LABEL: v2i8_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -691,8 +691,8 @@ ; ; GFX9-LABEL: v2i16_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -772,13 +772,13 @@ ; ; GFX9-LABEL: v2i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v2i32_arg: @@ -836,13 +836,13 @@ ; ; GFX9-LABEL: v2f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v2f32_arg: @@ -890,26 +890,26 @@ ; ; VI-LABEL: v3i8_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: s_lshr_b32 s1, s0, 16 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: s_add_u32 s0, s2, 2 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: s_addc_u32 s1, s3, 0 +; VI-NEXT: s_lshr_b32 s3, s2, 16 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_add_u32 s0, s0, 2 +; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_store_byte v[2:3], v5 ; VI-NEXT: flat_store_short v[0:1], v4 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v3i8_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -1046,14 +1046,14 @@ ; ; GFX9-LABEL: v3i16_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] offset:4 -; GFX9-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: global_store_short v0, v1, s[2:3] offset:4 +; GFX9-NEXT: global_store_dword v0, v2, s[2:3] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v3i16_arg: @@ -1150,21 +1150,21 @@ ; ; VI-LABEL: v3i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s4 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s0 ; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v3i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -1229,21 +1229,21 @@ ; ; VI-LABEL: v3f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s4 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s0 ; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v3f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -1315,8 +1315,8 @@ ; ; GFX9-LABEL: v4i8_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -1420,13 +1420,13 @@ ; ; GFX9-LABEL: v4i16_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v4i16_arg: @@ -1576,8 +1576,8 @@ ; ; GFX9-LABEL: v4i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -1650,8 +1650,8 @@ ; ; GFX9-LABEL: v4f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -1712,31 +1712,31 @@ ; VI-LABEL: v5i8_arg: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dword s1, s[0:1], 0x30 +; VI-NEXT: s_load_dword s4, s[0:1], 0x30 +; VI-NEXT: s_load_dword s5, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s2, 4 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v4, s1 ; VI-NEXT: s_addc_u32 s1, s3, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_byte v[2:3], v4 -; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v5i8_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: global_store_byte v0, v1, s[0:1] offset:4 -; GFX9-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: global_store_byte v0, v1, s[2:3] offset:4 +; GFX9-NEXT: global_store_dword v0, v2, s[2:3] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v5i8_arg: @@ -1857,34 +1857,34 @@ ; VI-LABEL: v5i16_arg: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; VI-NEXT: s_load_dword s1, s[0:1], 0x3c +; VI-NEXT: s_load_dword s5, s[0:1], 0x3c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s2, 8 +; VI-NEXT: s_add_u32 s4, s2, 8 +; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: s_addc_u32 s5, s3, 0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v4, s1 -; VI-NEXT: s_addc_u32 s1, s3, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_short v[2:3], v4 -; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v5i16_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x18 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_short v2, v3, s[0:1] offset:8 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_short v2, v3, s[2:3] offset:8 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v5i16_arg: @@ -2090,34 +2090,34 @@ ; ; VI-LABEL: v5i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 -; VI-NEXT: s_load_dword s1, s[0:1], 0x54 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s7, s[0:1], 0x54 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s2, 16 -; VI-NEXT: v_mov_b32_e32 v5, s3 -; VI-NEXT: v_mov_b32_e32 v2, s1 -; VI-NEXT: s_addc_u32 s1, s3, 0 +; VI-NEXT: s_add_u32 s6, s4, 16 +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_addc_u32 s7, s5, 0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v5i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 ; GFX9-NEXT: s_load_dword s8, s[4:5], 0x30 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 @@ -2188,31 +2188,31 @@ ; ; VI-LABEL: v5f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 -; VI-NEXT: s_load_dword s1, s[0:1], 0x54 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s7, s[0:1], 0x54 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s2, 16 -; VI-NEXT: v_mov_b32_e32 v5, s3 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: s_addc_u32 s1, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s1 -; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: flat_store_dword v[1:2], v3 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_add_u32 s6, s4, 16 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: s_addc_u32 s7, s5, 0 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: flat_store_dword v[1:2], v3 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v5f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 -; GFX9-NEXT: s_load_dword s8, s[4:5], 0x30 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -2220,7 +2220,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: global_store_dword v4, v0, s[6:7] offset:16 ; GFX9-NEXT: s_endpgm ; @@ -2294,8 +2294,8 @@ ; ; VI-LABEL: v5i64_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x64 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x84 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 @@ -2437,8 +2437,8 @@ ; ; VI-LABEL: v5f64_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x64 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x84 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 @@ -2580,13 +2580,13 @@ ; ; GFX9-LABEL: v8i8_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v8i8_arg: @@ -2836,8 +2836,8 @@ ; ; GFX9-LABEL: v8i16_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -3076,31 +3076,31 @@ ; ; VI-LABEL: v8i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_add_u32 s2, s0, 16 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v3, s11 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: s_add_u32 s4, s8, 16 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_addc_u32 s5, s9, 0 -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: v_mov_b32_e32 v4, s8 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_mov_b32_e32 v5, s9 +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v8i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s12 @@ -3187,31 +3187,31 @@ ; ; VI-LABEL: v8f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: s_add_u32 s4, s8, 16 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_addc_u32 s5, s9, 0 -; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_add_u32 s2, s0, 16 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v3, s11 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: v_mov_b32_e32 v4, s8 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_mov_b32_e32 v5, s9 +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v8f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s12 @@ -3307,8 +3307,8 @@ ; ; GFX9-LABEL: v16i8_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -3749,31 +3749,31 @@ ; ; VI-LABEL: v16i16_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_add_u32 s2, s0, 16 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v3, s11 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: s_add_u32 s4, s8, 16 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_addc_u32 s5, s9, 0 -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: v_mov_b32_e32 v4, s8 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_mov_b32_e32 v5, s9 +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v16i16_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s12 @@ -4217,76 +4217,74 @@ ; ; VI-LABEL: v16i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx16 s[0:15], s[0:1], 0x64 +; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_add_u32 s2, s0, 48 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: s_add_u32 s2, s0, 32 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: v_mov_b32_e32 v0, s12 -; VI-NEXT: s_add_u32 s12, s16, 48 ; VI-NEXT: v_mov_b32_e32 v1, s13 -; VI-NEXT: s_addc_u32 s13, s17, 0 -; VI-NEXT: v_mov_b32_e32 v4, s12 ; VI-NEXT: v_mov_b32_e32 v2, s14 ; VI-NEXT: v_mov_b32_e32 v3, s15 -; VI-NEXT: v_mov_b32_e32 v5, s13 +; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: s_nop 0 +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: s_add_u32 s8, s16, 32 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: s_addc_u32 s9, s17, 0 -; VI-NEXT: v_mov_b32_e32 v4, s8 ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: v_mov_b32_e32 v5, s9 +; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: s_nop 0 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: s_add_u32 s4, s16, 16 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_addc_u32 s5, s17, 0 -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: v_mov_b32_e32 v4, s16 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_mov_b32_e32 v5, s17 +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v16i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_load_dwordx16 s[0:15], s[4:5], 0x40 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s20 +; GFX9-NEXT: v_mov_b32_e32 v1, s21 +; GFX9-NEXT: v_mov_b32_e32 v2, s22 +; GFX9-NEXT: v_mov_b32_e32 v3, s23 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s12 ; GFX9-NEXT: v_mov_b32_e32 v1, s13 ; GFX9-NEXT: v_mov_b32_e32 v2, s14 ; GFX9-NEXT: v_mov_b32_e32 v3, s15 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:48 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: v_mov_b32_e32 v2, s10 ; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:32 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v16i32_arg: @@ -4407,76 +4405,74 @@ ; ; VI-LABEL: v16f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx16 s[0:15], s[0:1], 0x64 +; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_add_u32 s2, s0, 48 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: s_add_u32 s2, s0, 32 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: v_mov_b32_e32 v0, s12 -; VI-NEXT: s_add_u32 s12, s16, 48 ; VI-NEXT: v_mov_b32_e32 v1, s13 -; VI-NEXT: s_addc_u32 s13, s17, 0 -; VI-NEXT: v_mov_b32_e32 v4, s12 ; VI-NEXT: v_mov_b32_e32 v2, s14 ; VI-NEXT: v_mov_b32_e32 v3, s15 -; VI-NEXT: v_mov_b32_e32 v5, s13 +; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: s_nop 0 +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: s_add_u32 s8, s16, 32 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: s_addc_u32 s9, s17, 0 -; VI-NEXT: v_mov_b32_e32 v4, s8 ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: v_mov_b32_e32 v5, s9 +; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: s_nop 0 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: s_add_u32 s4, s16, 16 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_addc_u32 s5, s17, 0 -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: v_mov_b32_e32 v4, s16 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_mov_b32_e32 v5, s17 +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v16f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_load_dwordx16 s[0:15], s[4:5], 0x40 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s20 +; GFX9-NEXT: v_mov_b32_e32 v1, s21 +; GFX9-NEXT: v_mov_b32_e32 v2, s22 +; GFX9-NEXT: v_mov_b32_e32 v3, s23 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s12 ; GFX9-NEXT: v_mov_b32_e32 v1, s13 ; GFX9-NEXT: v_mov_b32_e32 v2, s14 ; GFX9-NEXT: v_mov_b32_e32 v3, s15 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:48 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: v_mov_b32_e32 v2, s10 ; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:32 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v16f32_arg: @@ -4715,37 +4711,37 @@ ; ; VI-LABEL: i65_arg: ; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_and_b32 s4, s4, 1 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: s_and_b32 s1, s0, 1 -; VI-NEXT: s_add_u32 s0, s2, 8 -; VI-NEXT: v_mov_b32_e32 v4, s1 -; VI-NEXT: s_addc_u32 s1, s3, 0 +; VI-NEXT: s_add_u32 s2, s2, 8 +; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: flat_store_byte v[2:3], v4 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_store_byte v[2:3], v4 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: i65_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: s_and_b32 s4, s6, 1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_byte v2, v3, s[0:1] offset:8 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_byte v2, v3, s[2:3] offset:8 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: i65_arg: @@ -4830,20 +4826,20 @@ ; ; VI-LABEL: i1_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: s_and_b32 s0, s0, 1 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_and_b32 s2, s2, 1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: i1_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 1 @@ -4921,20 +4917,20 @@ ; ; VI-LABEL: i1_arg_zext_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: s_and_b32 s0, s0, 1 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_and_b32 s2, s2, 1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: i1_arg_zext_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 1 @@ -4994,21 +4990,21 @@ ; ; VI-LABEL: i1_arg_zext_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: s_and_b32 s0, s0, 1 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_and_b32 s2, s2, 1 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: i1_arg_zext_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 1 @@ -5069,20 +5065,20 @@ ; ; VI-LABEL: i1_arg_sext_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: s_bfe_i32 s0, s0, 0x10000 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_bfe_i32 s2, s2, 0x10000 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: i1_arg_sext_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_i32 s2, s2, 0x10000 @@ -5144,27 +5140,27 @@ ; ; VI-LABEL: i1_arg_sext_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: i1_arg_sext_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: i1_arg_sext_i64: @@ -5423,23 +5419,23 @@ ; VI-NEXT: v_mov_b32_e32 v7, s3 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v6, s2 -; VI-NEXT: s_load_dword s4, s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x28 ; VI-NEXT: flat_load_ubyte v8, v[0:1] ; VI-NEXT: flat_load_ubyte v9, v[2:3] ; VI-NEXT: flat_load_ubyte v10, v[4:5] ; VI-NEXT: flat_load_ubyte v6, v[6:7] -; VI-NEXT: s_add_u32 s0, s0, 53 -; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_add_u32 s2, s0, 53 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_load_dword s2, s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x28 ; VI-NEXT: v_mov_b32_e32 v2, 0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v3, 0 -; VI-NEXT: v_mov_b32_e32 v7, s4 -; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v7, s2 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dword v[2:3], v7 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_dwordx2 v[2:3], v[4:5] @@ -5458,10 +5454,10 @@ ; GFX9-LABEL: packed_struct_argument_alignment: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v6, v2, s[4:5] offset:13 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:17 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4 -; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:17 -; GFX9-NEXT: global_load_dword v6, v2, s[4:5] offset:13 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -5747,20 +5743,20 @@ ; VI-LABEL: array_3xi32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[0:1], 0x24 -; VI-NEXT: s_load_dword s3, s[0:1], 0x28 -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dword s0, s[0:1], 0x30 +; VI-NEXT: s_load_dword s3, s[0:1], 0x30 +; VI-NEXT: s_load_dword s4, s[0:1], 0x28 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_short v[0:1], v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_dword v[0:1], v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: flat_store_dword v[0:1], v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: flat_store_dword v[0:1], v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm @@ -5768,20 +5764,20 @@ ; GFX9-LABEL: array_3xi32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s1, s[4:5], 0x4 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dword s3, s[4:5], 0xc +; GFX9-NEXT: s_load_dword s1, s[4:5], 0xc +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x4 +; GFX9-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_short v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v[0:1], v1, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -5870,26 +5866,26 @@ ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: s_add_u32 s4, s2, 2 ; VI-NEXT: s_addc_u32 s5, s3, 0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_add_u32 s2, s0, 42 +; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: s_add_u32 s2, s0, 42 +; VI-NEXT: flat_load_ushort v4, v[0:1] +; VI-NEXT: flat_load_ushort v2, v[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v5, s3 -; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: flat_load_ushort v1, v[2:3] -; VI-NEXT: flat_load_ushort v2, v[4:5] ; VI-NEXT: s_load_dword s0, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: flat_store_byte v[0:1], v3 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_store_byte v[0:1], v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_store_short v[0:1], v1 +; VI-NEXT: flat_store_short v[0:1], v4 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_short v[0:1], v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -5898,20 +5894,20 @@ ; GFX9-LABEL: array_3xi16: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] offset:2 +; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] offset:6 ; GFX9-NEXT: global_load_ushort v2, v0, s[4:5] offset:4 -; GFX9-NEXT: global_load_ushort v3, v0, s[4:5] offset:6 +; GFX9-NEXT: global_load_ushort v3, v0, s[4:5] offset:2 ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_short v[0:1], v3, off +; GFX9-NEXT: global_store_short v[0:1], v1, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_short v[0:1], v1, off +; GFX9-NEXT: global_store_short v[0:1], v3, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; @@ -6103,15 +6099,15 @@ ; ; GFX9-LABEL: byref_align_constant_i32_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x100 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-NEXT: global_store_dword v0, v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; @@ -6194,58 +6190,58 @@ ; ; VI-LABEL: byref_natural_align_constant_v16i32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 -; VI-NEXT: s_load_dword s18, s[0:1], 0xa4 -; VI-NEXT: s_load_dwordx16 s[0:15], s[0:1], 0x64 +; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s20, s[0:1], 0xa4 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_add_u32 s0, s2, 48 +; VI-NEXT: s_addc_u32 s1, s3, 0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: s_add_u32 s0, s2, 32 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_addc_u32 s1, s3, 0 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: s_add_u32 s0, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s12 -; VI-NEXT: s_add_u32 s12, s16, 48 ; VI-NEXT: v_mov_b32_e32 v1, s13 -; VI-NEXT: s_addc_u32 s13, s17, 0 -; VI-NEXT: v_mov_b32_e32 v4, s12 ; VI-NEXT: v_mov_b32_e32 v2, s14 ; VI-NEXT: v_mov_b32_e32 v3, s15 -; VI-NEXT: v_mov_b32_e32 v5, s13 +; VI-NEXT: s_addc_u32 s1, s3, 0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: s_add_u32 s8, s16, 32 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: s_addc_u32 s9, s17, 0 -; VI-NEXT: v_mov_b32_e32 v4, s8 ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: v_mov_b32_e32 v5, s9 +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: s_add_u32 s4, s16, 16 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_addc_u32 s5, s17, 0 -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, s16 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_mov_b32_e32 v5, s17 +; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s18 +; VI-NEXT: v_mov_b32_e32 v0, s20 ; VI-NEXT: flat_store_dword v[4:5], v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: byref_natural_align_constant_v16i32_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x80 ; GFX9-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x80 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s20 diff --git a/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll --- a/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll @@ -18,8 +18,8 @@ ; FUNC-LABEL: {{^}}v3i8_arg: ; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 -; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0 ; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8 +; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0 define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind { entry: store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4 @@ -29,8 +29,8 @@ ; FUNC-LABEL: {{^}}i65_arg: ; HSA-VI: kernarg_segment_byte_size = 24 ; HSA-VI: kernarg_segment_alignment = 4 -; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0 ; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8 +; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0 define amdgpu_kernel void @i65_arg(i65 addrspace(1)* nocapture %out, i65 %in) nounwind { entry: store i65 %in, i65 addrspace(1)* %out, align 4 @@ -76,8 +76,8 @@ ; FUNC-LABEL: {{^}}packed_struct_argument_alignment: ; HSA-VI: kernarg_segment_byte_size = 28 ; HSA-VI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; HSA-VI: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:17 ; HSA-VI: global_load_dword v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:13 +; HSA-VI: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:17 ; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 ; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4 define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) { @@ -114,9 +114,9 @@ ; GCN-LABEL: {{^}}array_3xi32: ; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 +; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0xc ; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x4 ; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8 -; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0xc define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) { store volatile i16 %arg0, i16 addrspace(1)* undef store volatile [3 x i32] %arg1, [3 x i32] addrspace(1)* undef diff --git a/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll b/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll --- a/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll @@ -25,25 +25,25 @@ ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s7, 0xe8f000 ; SI-NEXT: s_add_u32 s4, s4, s3 -; SI-NEXT: s_load_dword s2, s[0:1], 0x9 -; SI-NEXT: s_load_dword s3, s[0:1], 0xa -; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dword s3, s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xa ; SI-NEXT: s_addc_u32 s5, s5, 0 ; SI-NEXT: v_mov_b32_e32 v0, 0x42280000 -; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s1, s0, 4 -; SI-NEXT: s_lshl_b32 s0, s0, 3 -; SI-NEXT: s_add_i32 s0, s0, 32 -; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: s_lshl_b32 s1, s2, 4 +; SI-NEXT: s_lshl_b32 s2, s2, 3 +; SI-NEXT: s_add_i32 s2, s2, 32 +; SI-NEXT: v_mov_b32_e32 v1, s2 +; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_min_rtn_f32 v1, v1, v0 ; SI-NEXT: s_add_i32 s1, s1, 64 ; SI-NEXT: v_mov_b32_e32 v2, s1 ; SI-NEXT: ds_min_f32 v2, v0 -; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: s_waitcnt lgkmcnt(1) ; SI-NEXT: ds_min_rtn_f32 v0, v0, v1 -; SI-NEXT: v_mov_b32_e32 v1, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen ; SI-NEXT: s_endpgm @@ -52,25 +52,25 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 ; GFX7-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 +; GFX7-NEXT: s_load_dword s2, s[0:1], 0xb +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_mov_b32 s7, 0xe8f000 ; GFX7-NEXT: s_add_u32 s4, s4, s3 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GFX7-NEXT: s_load_dword s0, s[0:1], 0xb ; GFX7-NEXT: s_addc_u32 s5, s5, 0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b32 s3, s2, 3 ; GFX7-NEXT: v_mov_b32_e32 v0, 0x42280000 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshl_b32 s1, s0, 4 -; GFX7-NEXT: s_lshl_b32 s0, s0, 3 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-NEXT: ds_min_rtn_f32 v1, v1, v0 offset:32 -; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_lshl_b32 s2, s2, 4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: ds_min_f32 v2, v0 offset:64 -; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: s_waitcnt lgkmcnt(1) ; GFX7-NEXT: ds_min_rtn_f32 v0, v0, v1 -; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen ; GFX7-NEXT: s_endpgm @@ -79,25 +79,25 @@ ; VI: ; %bb.0: ; VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s90, -1 ; VI-NEXT: s_mov_b32 s91, 0xe80000 ; VI-NEXT: s_add_u32 s88, s88, s3 -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: s_addc_u32 s89, s89, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_lshl_b32 s3, s2, 3 ; VI-NEXT: v_mov_b32_e32 v0, 0x42280000 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_mov_b32 m0, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s1, s0, 4 -; VI-NEXT: s_lshl_b32 s0, s0, 3 -; VI-NEXT: v_mov_b32_e32 v1, s0 ; VI-NEXT: ds_min_rtn_f32 v1, v1, v0 offset:32 -; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: s_lshl_b32 s2, s2, 4 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: ds_min_f32 v2, v0 offset:64 -; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: s_waitcnt lgkmcnt(1) ; VI-NEXT: ds_min_rtn_f32 v0, v0, v1 -; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v1, s0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen ; VI-NEXT: s_endpgm @@ -109,13 +109,13 @@ ; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_mov_b32 s11, 0xe00000 ; GFX9-NEXT: s_add_u32 s8, s8, s3 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42280000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s1, s4, 3 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_lshl_b32 s0, s4, 3 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: ds_min_rtn_f32 v1, v1, v0 offset:32 ; GFX9-NEXT: s_lshl_b32 s0, s4, 4 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 @@ -158,23 +158,23 @@ ; G_SI-LABEL: lds_ds_fmin: ; G_SI: ; %bb.0: ; G_SI-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; G_SI-NEXT: s_load_dword s2, s[0:1], 0xb ; G_SI-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 ; G_SI-NEXT: s_mov_b32 s6, -1 ; G_SI-NEXT: s_mov_b32 s7, 0xe8f000 ; G_SI-NEXT: s_add_u32 s4, s4, s3 -; G_SI-NEXT: s_load_dword s2, s[0:1], 0xb -; G_SI-NEXT: s_load_dword s3, s[0:1], 0xa -; G_SI-NEXT: s_load_dword s0, s[0:1], 0x9 ; G_SI-NEXT: s_addc_u32 s5, s5, 0 -; G_SI-NEXT: v_mov_b32_e32 v0, 0x42280000 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: s_add_i32 s2, s2, 4 +; G_SI-NEXT: s_load_dword s3, s[0:1], 0xa +; G_SI-NEXT: s_load_dword s0, s[0:1], 0x9 ; G_SI-NEXT: s_lshl_b32 s1, s2, 3 +; G_SI-NEXT: v_mov_b32_e32 v0, 0x42280000 ; G_SI-NEXT: v_mov_b32_e32 v1, s1 ; G_SI-NEXT: s_mov_b32 m0, -1 ; G_SI-NEXT: ds_min_rtn_f32 v1, v1, v0 -; G_SI-NEXT: s_lshl_b32 s2, s2, 4 -; G_SI-NEXT: v_mov_b32_e32 v2, s2 +; G_SI-NEXT: s_lshl_b32 s1, s2, 4 +; G_SI-NEXT: v_mov_b32_e32 v2, s1 ; G_SI-NEXT: ds_min_rtn_f32 v0, v2, v0 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s3 @@ -186,56 +186,56 @@ ; ; G_GFX7-LABEL: lds_ds_fmin: ; G_GFX7: ; %bb.0: -; G_GFX7-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; G_GFX7-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x9 +; G_GFX7-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; G_GFX7-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 ; G_GFX7-NEXT: s_load_dword s2, s[0:1], 0xb -; G_GFX7-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; G_GFX7-NEXT: s_mov_b32 s10, -1 -; G_GFX7-NEXT: s_mov_b32 s11, 0xe8f000 -; G_GFX7-NEXT: s_add_u32 s8, s8, s3 -; G_GFX7-NEXT: s_addc_u32 s9, s9, 0 +; G_GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; G_GFX7-NEXT: s_mov_b32 s6, -1 +; G_GFX7-NEXT: s_mov_b32 s7, 0xe8f000 +; G_GFX7-NEXT: s_add_u32 s4, s4, s3 +; G_GFX7-NEXT: s_addc_u32 s5, s5, 0 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX7-NEXT: s_add_i32 s0, s2, 4 -; G_GFX7-NEXT: s_lshl_b32 s1, s0, 3 +; G_GFX7-NEXT: s_add_i32 s2, s2, 4 +; G_GFX7-NEXT: s_lshl_b32 s3, s2, 3 ; G_GFX7-NEXT: v_mov_b32_e32 v0, 0x42280000 -; G_GFX7-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX7-NEXT: v_mov_b32_e32 v1, s3 ; G_GFX7-NEXT: s_mov_b32 m0, -1 ; G_GFX7-NEXT: ds_min_rtn_f32 v1, v1, v0 -; G_GFX7-NEXT: s_lshl_b32 s0, s0, 4 -; G_GFX7-NEXT: v_mov_b32_e32 v2, s0 +; G_GFX7-NEXT: s_lshl_b32 s2, s2, 4 +; G_GFX7-NEXT: v_mov_b32_e32 v2, s2 ; G_GFX7-NEXT: ds_min_rtn_f32 v0, v2, v0 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX7-NEXT: v_mov_b32_e32 v0, s7 +; G_GFX7-NEXT: v_mov_b32_e32 v0, s1 ; G_GFX7-NEXT: ds_min_rtn_f32 v0, v0, v1 -; G_GFX7-NEXT: v_mov_b32_e32 v1, s6 +; G_GFX7-NEXT: v_mov_b32_e32 v1, s0 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX7-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen +; G_GFX7-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen ; G_GFX7-NEXT: s_endpgm ; ; G_VI-LABEL: lds_ds_fmin: ; G_VI: ; %bb.0: ; G_VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; G_VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 -; G_VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; G_VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; G_VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; G_VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; G_VI-NEXT: s_mov_b32 s90, -1 ; G_VI-NEXT: s_mov_b32 s91, 0xe80000 ; G_VI-NEXT: s_add_u32 s88, s88, s3 ; G_VI-NEXT: s_addc_u32 s89, s89, 0 ; G_VI-NEXT: s_waitcnt lgkmcnt(0) -; G_VI-NEXT: s_add_i32 s0, s2, 4 -; G_VI-NEXT: s_lshl_b32 s1, s0, 3 +; G_VI-NEXT: s_add_i32 s2, s2, 4 +; G_VI-NEXT: s_lshl_b32 s3, s2, 3 ; G_VI-NEXT: v_mov_b32_e32 v0, 0x42280000 -; G_VI-NEXT: v_mov_b32_e32 v1, s1 +; G_VI-NEXT: v_mov_b32_e32 v1, s3 ; G_VI-NEXT: s_mov_b32 m0, -1 ; G_VI-NEXT: ds_min_rtn_f32 v1, v1, v0 -; G_VI-NEXT: s_lshl_b32 s0, s0, 4 -; G_VI-NEXT: v_mov_b32_e32 v2, s0 +; G_VI-NEXT: s_lshl_b32 s2, s2, 4 +; G_VI-NEXT: v_mov_b32_e32 v2, s2 ; G_VI-NEXT: ds_min_rtn_f32 v0, v2, v0 ; G_VI-NEXT: s_waitcnt lgkmcnt(0) -; G_VI-NEXT: v_mov_b32_e32 v0, s7 +; G_VI-NEXT: v_mov_b32_e32 v0, s1 ; G_VI-NEXT: ds_min_rtn_f32 v0, v0, v1 -; G_VI-NEXT: v_mov_b32_e32 v1, s6 +; G_VI-NEXT: v_mov_b32_e32 v1, s0 ; G_VI-NEXT: s_waitcnt lgkmcnt(0) ; G_VI-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen ; G_VI-NEXT: s_endpgm @@ -243,9 +243,9 @@ ; G_GFX9-LABEL: lds_ds_fmin: ; G_GFX9: ; %bb.0: ; G_GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; G_GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 -; G_GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; G_GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; G_GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; G_GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 ; G_GFX9-NEXT: s_mov_b32 s10, -1 ; G_GFX9-NEXT: s_mov_b32 s11, 0xe00000 ; G_GFX9-NEXT: s_add_u32 s8, s8, s3 @@ -314,25 +314,25 @@ ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s7, 0xe8f000 ; SI-NEXT: s_add_u32 s4, s4, s3 -; SI-NEXT: s_load_dword s2, s[0:1], 0x9 -; SI-NEXT: s_load_dword s3, s[0:1], 0xa -; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dword s3, s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xa ; SI-NEXT: s_addc_u32 s5, s5, 0 ; SI-NEXT: v_mov_b32_e32 v0, 0x42280000 -; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s1, s0, 4 -; SI-NEXT: s_lshl_b32 s0, s0, 3 -; SI-NEXT: s_add_i32 s0, s0, 32 -; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: s_lshl_b32 s1, s2, 4 +; SI-NEXT: s_lshl_b32 s2, s2, 3 +; SI-NEXT: s_add_i32 s2, s2, 32 +; SI-NEXT: v_mov_b32_e32 v1, s2 +; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_max_rtn_f32 v1, v1, v0 ; SI-NEXT: s_add_i32 s1, s1, 64 ; SI-NEXT: v_mov_b32_e32 v2, s1 ; SI-NEXT: ds_max_f32 v2, v0 -; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: s_waitcnt lgkmcnt(1) ; SI-NEXT: ds_max_rtn_f32 v0, v0, v1 -; SI-NEXT: v_mov_b32_e32 v1, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen ; SI-NEXT: s_endpgm @@ -341,25 +341,25 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 ; GFX7-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 +; GFX7-NEXT: s_load_dword s2, s[0:1], 0xb +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_mov_b32 s7, 0xe8f000 ; GFX7-NEXT: s_add_u32 s4, s4, s3 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GFX7-NEXT: s_load_dword s0, s[0:1], 0xb ; GFX7-NEXT: s_addc_u32 s5, s5, 0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b32 s3, s2, 3 ; GFX7-NEXT: v_mov_b32_e32 v0, 0x42280000 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshl_b32 s1, s0, 4 -; GFX7-NEXT: s_lshl_b32 s0, s0, 3 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-NEXT: ds_max_rtn_f32 v1, v1, v0 offset:32 -; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_lshl_b32 s2, s2, 4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: ds_max_f32 v2, v0 offset:64 -; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: s_waitcnt lgkmcnt(1) ; GFX7-NEXT: ds_max_rtn_f32 v0, v0, v1 -; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen ; GFX7-NEXT: s_endpgm @@ -368,25 +368,25 @@ ; VI: ; %bb.0: ; VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s90, -1 ; VI-NEXT: s_mov_b32 s91, 0xe80000 ; VI-NEXT: s_add_u32 s88, s88, s3 -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: s_addc_u32 s89, s89, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_lshl_b32 s3, s2, 3 ; VI-NEXT: v_mov_b32_e32 v0, 0x42280000 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_mov_b32 m0, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s1, s0, 4 -; VI-NEXT: s_lshl_b32 s0, s0, 3 -; VI-NEXT: v_mov_b32_e32 v1, s0 ; VI-NEXT: ds_max_rtn_f32 v1, v1, v0 offset:32 -; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: s_lshl_b32 s2, s2, 4 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: ds_max_f32 v2, v0 offset:64 -; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: s_waitcnt lgkmcnt(1) ; VI-NEXT: ds_max_rtn_f32 v0, v0, v1 -; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v1, s0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen ; VI-NEXT: s_endpgm @@ -398,13 +398,13 @@ ; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_mov_b32 s11, 0xe00000 ; GFX9-NEXT: s_add_u32 s8, s8, s3 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42280000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s1, s4, 3 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_lshl_b32 s0, s4, 3 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: ds_max_rtn_f32 v1, v1, v0 offset:32 ; GFX9-NEXT: s_lshl_b32 s0, s4, 4 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 @@ -447,23 +447,23 @@ ; G_SI-LABEL: lds_ds_fmax: ; G_SI: ; %bb.0: ; G_SI-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; G_SI-NEXT: s_load_dword s2, s[0:1], 0xb ; G_SI-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 ; G_SI-NEXT: s_mov_b32 s6, -1 ; G_SI-NEXT: s_mov_b32 s7, 0xe8f000 ; G_SI-NEXT: s_add_u32 s4, s4, s3 -; G_SI-NEXT: s_load_dword s2, s[0:1], 0xb -; G_SI-NEXT: s_load_dword s3, s[0:1], 0xa -; G_SI-NEXT: s_load_dword s0, s[0:1], 0x9 ; G_SI-NEXT: s_addc_u32 s5, s5, 0 -; G_SI-NEXT: v_mov_b32_e32 v0, 0x42280000 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: s_add_i32 s2, s2, 4 +; G_SI-NEXT: s_load_dword s3, s[0:1], 0xa +; G_SI-NEXT: s_load_dword s0, s[0:1], 0x9 ; G_SI-NEXT: s_lshl_b32 s1, s2, 3 +; G_SI-NEXT: v_mov_b32_e32 v0, 0x42280000 ; G_SI-NEXT: v_mov_b32_e32 v1, s1 ; G_SI-NEXT: s_mov_b32 m0, -1 ; G_SI-NEXT: ds_max_rtn_f32 v1, v1, v0 -; G_SI-NEXT: s_lshl_b32 s2, s2, 4 -; G_SI-NEXT: v_mov_b32_e32 v2, s2 +; G_SI-NEXT: s_lshl_b32 s1, s2, 4 +; G_SI-NEXT: v_mov_b32_e32 v2, s1 ; G_SI-NEXT: ds_max_rtn_f32 v0, v2, v0 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s3 @@ -475,56 +475,56 @@ ; ; G_GFX7-LABEL: lds_ds_fmax: ; G_GFX7: ; %bb.0: -; G_GFX7-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; G_GFX7-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x9 +; G_GFX7-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; G_GFX7-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 ; G_GFX7-NEXT: s_load_dword s2, s[0:1], 0xb -; G_GFX7-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; G_GFX7-NEXT: s_mov_b32 s10, -1 -; G_GFX7-NEXT: s_mov_b32 s11, 0xe8f000 -; G_GFX7-NEXT: s_add_u32 s8, s8, s3 -; G_GFX7-NEXT: s_addc_u32 s9, s9, 0 +; G_GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; G_GFX7-NEXT: s_mov_b32 s6, -1 +; G_GFX7-NEXT: s_mov_b32 s7, 0xe8f000 +; G_GFX7-NEXT: s_add_u32 s4, s4, s3 +; G_GFX7-NEXT: s_addc_u32 s5, s5, 0 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX7-NEXT: s_add_i32 s0, s2, 4 -; G_GFX7-NEXT: s_lshl_b32 s1, s0, 3 +; G_GFX7-NEXT: s_add_i32 s2, s2, 4 +; G_GFX7-NEXT: s_lshl_b32 s3, s2, 3 ; G_GFX7-NEXT: v_mov_b32_e32 v0, 0x42280000 -; G_GFX7-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX7-NEXT: v_mov_b32_e32 v1, s3 ; G_GFX7-NEXT: s_mov_b32 m0, -1 ; G_GFX7-NEXT: ds_max_rtn_f32 v1, v1, v0 -; G_GFX7-NEXT: s_lshl_b32 s0, s0, 4 -; G_GFX7-NEXT: v_mov_b32_e32 v2, s0 +; G_GFX7-NEXT: s_lshl_b32 s2, s2, 4 +; G_GFX7-NEXT: v_mov_b32_e32 v2, s2 ; G_GFX7-NEXT: ds_max_rtn_f32 v0, v2, v0 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX7-NEXT: v_mov_b32_e32 v0, s7 +; G_GFX7-NEXT: v_mov_b32_e32 v0, s1 ; G_GFX7-NEXT: ds_max_rtn_f32 v0, v0, v1 -; G_GFX7-NEXT: v_mov_b32_e32 v1, s6 +; G_GFX7-NEXT: v_mov_b32_e32 v1, s0 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX7-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen +; G_GFX7-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen ; G_GFX7-NEXT: s_endpgm ; ; G_VI-LABEL: lds_ds_fmax: ; G_VI: ; %bb.0: ; G_VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; G_VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 -; G_VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; G_VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; G_VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; G_VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; G_VI-NEXT: s_mov_b32 s90, -1 ; G_VI-NEXT: s_mov_b32 s91, 0xe80000 ; G_VI-NEXT: s_add_u32 s88, s88, s3 ; G_VI-NEXT: s_addc_u32 s89, s89, 0 ; G_VI-NEXT: s_waitcnt lgkmcnt(0) -; G_VI-NEXT: s_add_i32 s0, s2, 4 -; G_VI-NEXT: s_lshl_b32 s1, s0, 3 +; G_VI-NEXT: s_add_i32 s2, s2, 4 +; G_VI-NEXT: s_lshl_b32 s3, s2, 3 ; G_VI-NEXT: v_mov_b32_e32 v0, 0x42280000 -; G_VI-NEXT: v_mov_b32_e32 v1, s1 +; G_VI-NEXT: v_mov_b32_e32 v1, s3 ; G_VI-NEXT: s_mov_b32 m0, -1 ; G_VI-NEXT: ds_max_rtn_f32 v1, v1, v0 -; G_VI-NEXT: s_lshl_b32 s0, s0, 4 -; G_VI-NEXT: v_mov_b32_e32 v2, s0 +; G_VI-NEXT: s_lshl_b32 s2, s2, 4 +; G_VI-NEXT: v_mov_b32_e32 v2, s2 ; G_VI-NEXT: ds_max_rtn_f32 v0, v2, v0 ; G_VI-NEXT: s_waitcnt lgkmcnt(0) -; G_VI-NEXT: v_mov_b32_e32 v0, s7 +; G_VI-NEXT: v_mov_b32_e32 v0, s1 ; G_VI-NEXT: ds_max_rtn_f32 v0, v0, v1 -; G_VI-NEXT: v_mov_b32_e32 v1, s6 +; G_VI-NEXT: v_mov_b32_e32 v1, s0 ; G_VI-NEXT: s_waitcnt lgkmcnt(0) ; G_VI-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen ; G_VI-NEXT: s_endpgm @@ -532,9 +532,9 @@ ; G_GFX9-LABEL: lds_ds_fmax: ; G_GFX9: ; %bb.0: ; G_GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; G_GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 -; G_GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; G_GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; G_GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; G_GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 ; G_GFX9-NEXT: s_mov_b32 s10, -1 ; G_GFX9-NEXT: s_mov_b32 s11, 0xe00000 ; G_GFX9-NEXT: s_add_u32 s8, s8, s3 @@ -603,64 +603,64 @@ ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s11, 0xe8f000 ; SI-NEXT: s_add_u32 s8, s8, s3 -; SI-NEXT: s_load_dword s2, s[0:1], 0x9 -; SI-NEXT: s_load_dword s3, s[0:1], 0xa -; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dword s3, s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xa ; SI-NEXT: s_addc_u32 s9, s9, 0 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s1, s0, 4 -; SI-NEXT: s_lshl_b32 s0, s0, 3 -; SI-NEXT: s_add_i32 s4, s0, 32 -; SI-NEXT: s_add_i32 s5, s1, 64 +; SI-NEXT: s_lshl_b32 s0, s2, 3 +; SI-NEXT: s_lshl_b32 s5, s2, 4 +; SI-NEXT: s_add_i32 s2, s0, 32 ; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_mov_b32 s1, 0x40450000 ; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: v_mov_b32_e32 v2, s2 ; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] -; SI-NEXT: v_mov_b32_e32 v4, s5 +; SI-NEXT: s_add_i32 s0, s5, 64 +; SI-NEXT: v_mov_b32_e32 v4, s0 ; SI-NEXT: ds_min_f64 v4, v[0:1] -; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_waitcnt lgkmcnt(1) ; SI-NEXT: ds_min_rtn_f64 v[0:1], v0, v[2:3] -; SI-NEXT: s_add_i32 s0, s2, 4 +; SI-NEXT: s_add_i32 s0, s3, 4 ; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v1, v2, s[8:11], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen ; SI-NEXT: s_endpgm ; ; GFX7-LABEL: lds_ds_fmin_f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX7-NEXT: s_load_dword s0, s[0:1], 0xb ; GFX7-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s11, 0xe8f000 ; GFX7-NEXT: s_add_u32 s8, s8, s3 ; GFX7-NEXT: s_mov_b32 s2, 0 -; GFX7-NEXT: s_addc_u32 s9, s9, 0 ; GFX7-NEXT: s_mov_b32 s3, 0x40450000 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshl_b32 s1, s0, 4 -; GFX7-NEXT: s_lshl_b32 s0, s0, 3 ; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: s_addc_u32 s9, s9, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b32 s2, s4, 3 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] offset:32 -; GFX7-NEXT: v_mov_b32_e32 v4, s1 +; GFX7-NEXT: s_lshl_b32 s2, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-NEXT: ds_min_f64 v4, v[0:1] offset:64 -; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: s_waitcnt lgkmcnt(1) ; GFX7-NEXT: ds_min_rtn_f64 v[0:1], v0, v[2:3] -; GFX7-NEXT: s_add_i32 s0, s4, 4 -; GFX7-NEXT: v_mov_b32_e32 v3, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_add_i32 s1, s0, 4 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: buffer_store_dword v1, v3, s[8:11], 0 offen ; GFX7-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen @@ -669,31 +669,31 @@ ; VI-LABEL: lds_ds_fmin_f64: ; VI: ; %bb.0: ; VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s90, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s91, 0xe80000 ; VI-NEXT: s_add_u32 s88, s88, s3 ; VI-NEXT: s_mov_b32 s2, 0 -; VI-NEXT: s_addc_u32 s89, s89, 0 ; VI-NEXT: s_mov_b32 s3, 0x40450000 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s1, s0, 4 -; VI-NEXT: s_lshl_b32 s0, s0, 3 ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_addc_u32 s89, s89, 0 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_lshl_b32 s2, s4, 3 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] offset:32 -; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: s_lshl_b32 s2, s4, 4 +; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: ds_min_f64 v4, v[0:1] offset:64 -; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: s_waitcnt lgkmcnt(1) ; VI-NEXT: ds_min_rtn_f64 v[0:1], v0, v[2:3] -; VI-NEXT: s_add_i32 s0, s4, 4 -; VI-NEXT: v_mov_b32_e32 v3, s0 -; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_add_i32 s1, s0, 4 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; VI-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen @@ -706,19 +706,19 @@ ; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_mov_b32 s11, 0xe00000 ; GFX9-NEXT: s_add_u32 s8, s8, s3 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_mov_b32 s1, 0x40450000 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s5, s4, 4 -; GFX9-NEXT: s_lshl_b32 s4, s4, 3 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b32 s0, s4, 3 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: s_lshl_b32 s0, s4, 4 +; GFX9-NEXT: v_mov_b32_e32 v5, s0 ; GFX9-NEXT: v_mov_b32_e32 v4, s3 ; GFX9-NEXT: ds_min_f64 v5, v[0:1] offset:64 ; GFX9-NEXT: s_waitcnt lgkmcnt(1) @@ -763,32 +763,32 @@ ; G_SI-LABEL: lds_ds_fmin_f64: ; G_SI: ; %bb.0: ; G_SI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; G_SI-NEXT: s_load_dword s4, s[0:1], 0xb ; G_SI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; G_SI-NEXT: s_mov_b32 s10, -1 ; G_SI-NEXT: s_mov_b32 s11, 0xe8f000 ; G_SI-NEXT: s_add_u32 s8, s8, s3 -; G_SI-NEXT: s_load_dword s2, s[0:1], 0xb -; G_SI-NEXT: s_load_dword s3, s[0:1], 0xa -; G_SI-NEXT: s_load_dword s4, s[0:1], 0x9 ; G_SI-NEXT: s_addc_u32 s9, s9, 0 -; G_SI-NEXT: s_mov_b32 s0, 0 +; G_SI-NEXT: s_mov_b32 s2, 0 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) -; G_SI-NEXT: s_add_i32 s2, s2, 4 -; G_SI-NEXT: s_lshl_b32 s5, s2, 3 -; G_SI-NEXT: s_mov_b32 s1, 0x40450000 -; G_SI-NEXT: v_mov_b32_e32 v0, s0 -; G_SI-NEXT: v_mov_b32_e32 v2, s5 -; G_SI-NEXT: v_mov_b32_e32 v1, s1 +; G_SI-NEXT: s_add_i32 s4, s4, 4 +; G_SI-NEXT: s_mov_b32 s3, 0x40450000 +; G_SI-NEXT: s_load_dword s5, s[0:1], 0xa +; G_SI-NEXT: s_load_dword s0, s[0:1], 0x9 +; G_SI-NEXT: s_lshl_b32 s1, s4, 3 +; G_SI-NEXT: v_mov_b32_e32 v0, s2 +; G_SI-NEXT: v_mov_b32_e32 v2, s1 +; G_SI-NEXT: v_mov_b32_e32 v1, s3 ; G_SI-NEXT: s_mov_b32 m0, -1 ; G_SI-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] -; G_SI-NEXT: s_lshl_b32 s2, s2, 4 -; G_SI-NEXT: v_mov_b32_e32 v4, s2 +; G_SI-NEXT: s_lshl_b32 s1, s4, 4 +; G_SI-NEXT: v_mov_b32_e32 v4, s1 ; G_SI-NEXT: ds_min_rtn_f64 v[0:1], v4, v[0:1] ; G_SI-NEXT: s_waitcnt lgkmcnt(0) -; G_SI-NEXT: v_mov_b32_e32 v0, s3 +; G_SI-NEXT: v_mov_b32_e32 v0, s5 ; G_SI-NEXT: ds_min_rtn_f64 v[0:1], v0, v[2:3] -; G_SI-NEXT: v_mov_b32_e32 v2, s4 -; G_SI-NEXT: s_add_u32 s0, s4, 4 +; G_SI-NEXT: v_mov_b32_e32 v2, s0 +; G_SI-NEXT: s_add_u32 s0, s0, 4 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen ; G_SI-NEXT: s_waitcnt expcnt(0) @@ -799,31 +799,31 @@ ; G_GFX7-LABEL: lds_ds_fmin_f64: ; G_GFX7: ; %bb.0: ; G_GFX7-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; G_GFX7-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x9 -; G_GFX7-NEXT: s_load_dword s2, s[0:1], 0xb ; G_GFX7-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; G_GFX7-NEXT: s_load_dword s2, s[0:1], 0xb +; G_GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; G_GFX7-NEXT: s_mov_b32 s10, -1 ; G_GFX7-NEXT: s_mov_b32 s11, 0xe8f000 ; G_GFX7-NEXT: s_add_u32 s8, s8, s3 ; G_GFX7-NEXT: s_addc_u32 s9, s9, 0 ; G_GFX7-NEXT: s_mov_b32 s4, 0 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX7-NEXT: s_add_i32 s0, s2, 4 +; G_GFX7-NEXT: s_add_i32 s2, s2, 4 ; G_GFX7-NEXT: s_mov_b32 s5, 0x40450000 -; G_GFX7-NEXT: s_lshl_b32 s1, s0, 3 ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 -; G_GFX7-NEXT: v_mov_b32_e32 v2, s1 +; G_GFX7-NEXT: s_lshl_b32 s3, s2, 3 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 +; G_GFX7-NEXT: v_mov_b32_e32 v2, s3 ; G_GFX7-NEXT: s_mov_b32 m0, -1 ; G_GFX7-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] -; G_GFX7-NEXT: s_lshl_b32 s0, s0, 4 -; G_GFX7-NEXT: v_mov_b32_e32 v4, s0 +; G_GFX7-NEXT: s_lshl_b32 s2, s2, 4 +; G_GFX7-NEXT: v_mov_b32_e32 v4, s2 ; G_GFX7-NEXT: ds_min_rtn_f64 v[0:1], v4, v[0:1] ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX7-NEXT: v_mov_b32_e32 v0, s7 +; G_GFX7-NEXT: v_mov_b32_e32 v0, s1 ; G_GFX7-NEXT: ds_min_rtn_f64 v[0:1], v0, v[2:3] -; G_GFX7-NEXT: v_mov_b32_e32 v2, s6 -; G_GFX7-NEXT: s_add_u32 s0, s6, 4 +; G_GFX7-NEXT: v_mov_b32_e32 v2, s0 +; G_GFX7-NEXT: s_add_u32 s0, s0, 4 ; G_GFX7-NEXT: v_mov_b32_e32 v3, s0 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen @@ -833,31 +833,31 @@ ; G_VI-LABEL: lds_ds_fmin_f64: ; G_VI: ; %bb.0: ; G_VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; G_VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 -; G_VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; G_VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; G_VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; G_VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; G_VI-NEXT: s_mov_b32 s90, -1 ; G_VI-NEXT: s_mov_b32 s91, 0xe80000 ; G_VI-NEXT: s_add_u32 s88, s88, s3 ; G_VI-NEXT: s_addc_u32 s89, s89, 0 ; G_VI-NEXT: s_mov_b32 s4, 0 ; G_VI-NEXT: s_waitcnt lgkmcnt(0) -; G_VI-NEXT: s_add_i32 s0, s2, 4 +; G_VI-NEXT: s_add_i32 s2, s2, 4 ; G_VI-NEXT: s_mov_b32 s5, 0x40450000 -; G_VI-NEXT: s_lshl_b32 s1, s0, 3 ; G_VI-NEXT: v_mov_b32_e32 v0, s4 -; G_VI-NEXT: v_mov_b32_e32 v2, s1 +; G_VI-NEXT: s_lshl_b32 s3, s2, 3 ; G_VI-NEXT: v_mov_b32_e32 v1, s5 +; G_VI-NEXT: v_mov_b32_e32 v2, s3 ; G_VI-NEXT: s_mov_b32 m0, -1 ; G_VI-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] -; G_VI-NEXT: s_lshl_b32 s0, s0, 4 -; G_VI-NEXT: v_mov_b32_e32 v4, s0 +; G_VI-NEXT: s_lshl_b32 s2, s2, 4 +; G_VI-NEXT: v_mov_b32_e32 v4, s2 ; G_VI-NEXT: ds_min_rtn_f64 v[0:1], v4, v[0:1] ; G_VI-NEXT: s_waitcnt lgkmcnt(0) -; G_VI-NEXT: v_mov_b32_e32 v0, s7 +; G_VI-NEXT: v_mov_b32_e32 v0, s1 ; G_VI-NEXT: ds_min_rtn_f64 v[0:1], v0, v[2:3] -; G_VI-NEXT: v_mov_b32_e32 v2, s6 -; G_VI-NEXT: s_add_u32 s0, s6, 4 +; G_VI-NEXT: v_mov_b32_e32 v2, s0 +; G_VI-NEXT: s_add_u32 s0, s0, 4 ; G_VI-NEXT: v_mov_b32_e32 v3, s0 ; G_VI-NEXT: s_waitcnt lgkmcnt(0) ; G_VI-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen @@ -867,9 +867,9 @@ ; G_GFX9-LABEL: lds_ds_fmin_f64: ; G_GFX9: ; %bb.0: ; G_GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; G_GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 -; G_GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; G_GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; G_GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; G_GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 ; G_GFX9-NEXT: s_mov_b32 s10, -1 ; G_GFX9-NEXT: s_mov_b32 s11, 0xe00000 ; G_GFX9-NEXT: s_mov_b32 s0, 0 @@ -946,64 +946,64 @@ ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s11, 0xe8f000 ; SI-NEXT: s_add_u32 s8, s8, s3 -; SI-NEXT: s_load_dword s2, s[0:1], 0x9 -; SI-NEXT: s_load_dword s3, s[0:1], 0xa -; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dword s3, s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xa ; SI-NEXT: s_addc_u32 s9, s9, 0 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s1, s0, 4 -; SI-NEXT: s_lshl_b32 s0, s0, 3 -; SI-NEXT: s_add_i32 s4, s0, 32 -; SI-NEXT: s_add_i32 s5, s1, 64 +; SI-NEXT: s_lshl_b32 s0, s2, 3 +; SI-NEXT: s_lshl_b32 s5, s2, 4 +; SI-NEXT: s_add_i32 s2, s0, 32 ; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_mov_b32 s1, 0x40450000 ; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: v_mov_b32_e32 v2, s2 ; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] -; SI-NEXT: v_mov_b32_e32 v4, s5 +; SI-NEXT: s_add_i32 s0, s5, 64 +; SI-NEXT: v_mov_b32_e32 v4, s0 ; SI-NEXT: ds_max_f64 v4, v[0:1] -; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_waitcnt lgkmcnt(1) ; SI-NEXT: ds_max_rtn_f64 v[0:1], v0, v[2:3] -; SI-NEXT: s_add_i32 s0, s2, 4 +; SI-NEXT: s_add_i32 s0, s3, 4 ; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v1, v2, s[8:11], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen ; SI-NEXT: s_endpgm ; ; GFX7-LABEL: lds_ds_fmax_f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX7-NEXT: s_load_dword s0, s[0:1], 0xb ; GFX7-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s11, 0xe8f000 ; GFX7-NEXT: s_add_u32 s8, s8, s3 ; GFX7-NEXT: s_mov_b32 s2, 0 -; GFX7-NEXT: s_addc_u32 s9, s9, 0 ; GFX7-NEXT: s_mov_b32 s3, 0x40450000 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshl_b32 s1, s0, 4 -; GFX7-NEXT: s_lshl_b32 s0, s0, 3 ; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: s_addc_u32 s9, s9, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b32 s2, s4, 3 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] offset:32 -; GFX7-NEXT: v_mov_b32_e32 v4, s1 +; GFX7-NEXT: s_lshl_b32 s2, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-NEXT: ds_max_f64 v4, v[0:1] offset:64 -; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: s_waitcnt lgkmcnt(1) ; GFX7-NEXT: ds_max_rtn_f64 v[0:1], v0, v[2:3] -; GFX7-NEXT: s_add_i32 s0, s4, 4 -; GFX7-NEXT: v_mov_b32_e32 v3, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_add_i32 s1, s0, 4 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: buffer_store_dword v1, v3, s[8:11], 0 offen ; GFX7-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen @@ -1012,31 +1012,31 @@ ; VI-LABEL: lds_ds_fmax_f64: ; VI: ; %bb.0: ; VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s90, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s91, 0xe80000 ; VI-NEXT: s_add_u32 s88, s88, s3 ; VI-NEXT: s_mov_b32 s2, 0 -; VI-NEXT: s_addc_u32 s89, s89, 0 ; VI-NEXT: s_mov_b32 s3, 0x40450000 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s1, s0, 4 -; VI-NEXT: s_lshl_b32 s0, s0, 3 ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_addc_u32 s89, s89, 0 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_lshl_b32 s2, s4, 3 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] offset:32 -; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: s_lshl_b32 s2, s4, 4 +; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: ds_max_f64 v4, v[0:1] offset:64 -; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: s_waitcnt lgkmcnt(1) ; VI-NEXT: ds_max_rtn_f64 v[0:1], v0, v[2:3] -; VI-NEXT: s_add_i32 s0, s4, 4 -; VI-NEXT: v_mov_b32_e32 v3, s0 -; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_add_i32 s1, s0, 4 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; VI-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen @@ -1049,19 +1049,19 @@ ; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_mov_b32 s11, 0xe00000 ; GFX9-NEXT: s_add_u32 s8, s8, s3 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_mov_b32 s1, 0x40450000 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s5, s4, 4 -; GFX9-NEXT: s_lshl_b32 s4, s4, 3 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b32 s0, s4, 3 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: s_lshl_b32 s0, s4, 4 +; GFX9-NEXT: v_mov_b32_e32 v5, s0 ; GFX9-NEXT: v_mov_b32_e32 v4, s3 ; GFX9-NEXT: ds_max_f64 v5, v[0:1] offset:64 ; GFX9-NEXT: s_waitcnt lgkmcnt(1) @@ -1106,32 +1106,32 @@ ; G_SI-LABEL: lds_ds_fmax_f64: ; G_SI: ; %bb.0: ; G_SI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; G_SI-NEXT: s_load_dword s4, s[0:1], 0xb ; G_SI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; G_SI-NEXT: s_mov_b32 s10, -1 ; G_SI-NEXT: s_mov_b32 s11, 0xe8f000 ; G_SI-NEXT: s_add_u32 s8, s8, s3 -; G_SI-NEXT: s_load_dword s2, s[0:1], 0xb -; G_SI-NEXT: s_load_dword s3, s[0:1], 0xa -; G_SI-NEXT: s_load_dword s4, s[0:1], 0x9 ; G_SI-NEXT: s_addc_u32 s9, s9, 0 -; G_SI-NEXT: s_mov_b32 s0, 0 +; G_SI-NEXT: s_mov_b32 s2, 0 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) -; G_SI-NEXT: s_add_i32 s2, s2, 4 -; G_SI-NEXT: s_lshl_b32 s5, s2, 3 -; G_SI-NEXT: s_mov_b32 s1, 0x40450000 -; G_SI-NEXT: v_mov_b32_e32 v0, s0 -; G_SI-NEXT: v_mov_b32_e32 v2, s5 -; G_SI-NEXT: v_mov_b32_e32 v1, s1 +; G_SI-NEXT: s_add_i32 s4, s4, 4 +; G_SI-NEXT: s_mov_b32 s3, 0x40450000 +; G_SI-NEXT: s_load_dword s5, s[0:1], 0xa +; G_SI-NEXT: s_load_dword s0, s[0:1], 0x9 +; G_SI-NEXT: s_lshl_b32 s1, s4, 3 +; G_SI-NEXT: v_mov_b32_e32 v0, s2 +; G_SI-NEXT: v_mov_b32_e32 v2, s1 +; G_SI-NEXT: v_mov_b32_e32 v1, s3 ; G_SI-NEXT: s_mov_b32 m0, -1 ; G_SI-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] -; G_SI-NEXT: s_lshl_b32 s2, s2, 4 -; G_SI-NEXT: v_mov_b32_e32 v4, s2 +; G_SI-NEXT: s_lshl_b32 s1, s4, 4 +; G_SI-NEXT: v_mov_b32_e32 v4, s1 ; G_SI-NEXT: ds_max_rtn_f64 v[0:1], v4, v[0:1] ; G_SI-NEXT: s_waitcnt lgkmcnt(0) -; G_SI-NEXT: v_mov_b32_e32 v0, s3 +; G_SI-NEXT: v_mov_b32_e32 v0, s5 ; G_SI-NEXT: ds_max_rtn_f64 v[0:1], v0, v[2:3] -; G_SI-NEXT: v_mov_b32_e32 v2, s4 -; G_SI-NEXT: s_add_u32 s0, s4, 4 +; G_SI-NEXT: v_mov_b32_e32 v2, s0 +; G_SI-NEXT: s_add_u32 s0, s0, 4 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen ; G_SI-NEXT: s_waitcnt expcnt(0) @@ -1142,31 +1142,31 @@ ; G_GFX7-LABEL: lds_ds_fmax_f64: ; G_GFX7: ; %bb.0: ; G_GFX7-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; G_GFX7-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x9 -; G_GFX7-NEXT: s_load_dword s2, s[0:1], 0xb ; G_GFX7-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; G_GFX7-NEXT: s_load_dword s2, s[0:1], 0xb +; G_GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; G_GFX7-NEXT: s_mov_b32 s10, -1 ; G_GFX7-NEXT: s_mov_b32 s11, 0xe8f000 ; G_GFX7-NEXT: s_add_u32 s8, s8, s3 ; G_GFX7-NEXT: s_addc_u32 s9, s9, 0 ; G_GFX7-NEXT: s_mov_b32 s4, 0 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX7-NEXT: s_add_i32 s0, s2, 4 +; G_GFX7-NEXT: s_add_i32 s2, s2, 4 ; G_GFX7-NEXT: s_mov_b32 s5, 0x40450000 -; G_GFX7-NEXT: s_lshl_b32 s1, s0, 3 ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 -; G_GFX7-NEXT: v_mov_b32_e32 v2, s1 +; G_GFX7-NEXT: s_lshl_b32 s3, s2, 3 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 +; G_GFX7-NEXT: v_mov_b32_e32 v2, s3 ; G_GFX7-NEXT: s_mov_b32 m0, -1 ; G_GFX7-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] -; G_GFX7-NEXT: s_lshl_b32 s0, s0, 4 -; G_GFX7-NEXT: v_mov_b32_e32 v4, s0 +; G_GFX7-NEXT: s_lshl_b32 s2, s2, 4 +; G_GFX7-NEXT: v_mov_b32_e32 v4, s2 ; G_GFX7-NEXT: ds_max_rtn_f64 v[0:1], v4, v[0:1] ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX7-NEXT: v_mov_b32_e32 v0, s7 +; G_GFX7-NEXT: v_mov_b32_e32 v0, s1 ; G_GFX7-NEXT: ds_max_rtn_f64 v[0:1], v0, v[2:3] -; G_GFX7-NEXT: v_mov_b32_e32 v2, s6 -; G_GFX7-NEXT: s_add_u32 s0, s6, 4 +; G_GFX7-NEXT: v_mov_b32_e32 v2, s0 +; G_GFX7-NEXT: s_add_u32 s0, s0, 4 ; G_GFX7-NEXT: v_mov_b32_e32 v3, s0 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen @@ -1176,31 +1176,31 @@ ; G_VI-LABEL: lds_ds_fmax_f64: ; G_VI: ; %bb.0: ; G_VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; G_VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 -; G_VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; G_VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; G_VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; G_VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; G_VI-NEXT: s_mov_b32 s90, -1 ; G_VI-NEXT: s_mov_b32 s91, 0xe80000 ; G_VI-NEXT: s_add_u32 s88, s88, s3 ; G_VI-NEXT: s_addc_u32 s89, s89, 0 ; G_VI-NEXT: s_mov_b32 s4, 0 ; G_VI-NEXT: s_waitcnt lgkmcnt(0) -; G_VI-NEXT: s_add_i32 s0, s2, 4 +; G_VI-NEXT: s_add_i32 s2, s2, 4 ; G_VI-NEXT: s_mov_b32 s5, 0x40450000 -; G_VI-NEXT: s_lshl_b32 s1, s0, 3 ; G_VI-NEXT: v_mov_b32_e32 v0, s4 -; G_VI-NEXT: v_mov_b32_e32 v2, s1 +; G_VI-NEXT: s_lshl_b32 s3, s2, 3 ; G_VI-NEXT: v_mov_b32_e32 v1, s5 +; G_VI-NEXT: v_mov_b32_e32 v2, s3 ; G_VI-NEXT: s_mov_b32 m0, -1 ; G_VI-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] -; G_VI-NEXT: s_lshl_b32 s0, s0, 4 -; G_VI-NEXT: v_mov_b32_e32 v4, s0 +; G_VI-NEXT: s_lshl_b32 s2, s2, 4 +; G_VI-NEXT: v_mov_b32_e32 v4, s2 ; G_VI-NEXT: ds_max_rtn_f64 v[0:1], v4, v[0:1] ; G_VI-NEXT: s_waitcnt lgkmcnt(0) -; G_VI-NEXT: v_mov_b32_e32 v0, s7 +; G_VI-NEXT: v_mov_b32_e32 v0, s1 ; G_VI-NEXT: ds_max_rtn_f64 v[0:1], v0, v[2:3] -; G_VI-NEXT: v_mov_b32_e32 v2, s6 -; G_VI-NEXT: s_add_u32 s0, s6, 4 +; G_VI-NEXT: v_mov_b32_e32 v2, s0 +; G_VI-NEXT: s_add_u32 s0, s0, 4 ; G_VI-NEXT: v_mov_b32_e32 v3, s0 ; G_VI-NEXT: s_waitcnt lgkmcnt(0) ; G_VI-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen @@ -1210,9 +1210,9 @@ ; G_GFX9-LABEL: lds_ds_fmax_f64: ; G_GFX9: ; %bb.0: ; G_GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; G_GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 -; G_GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; G_GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; G_GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; G_GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 ; G_GFX9-NEXT: s_mov_b32 s10, -1 ; G_GFX9-NEXT: s_mov_b32 s11, 0xe00000 ; G_GFX9-NEXT: s_mov_b32 s0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll @@ -24,8 +24,8 @@ } ; GCN-LABEL: {{^}}class_f16_fabs: -; GCN: s_load_dword s[[SA_F16:[0-9]+]] ; GCN: s_load_dword s[[SB_I32:[0-9]+]] +; GCN: s_load_dword s[[SA_F16:[0-9]+]] ; GCN: v_mov_b32_e32 [[V_B_I32:v[0-9]+]], s[[SB_I32]] ; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |s[[SA_F16]]|, [[V_B_I32]] ; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]] @@ -46,8 +46,8 @@ } ; GCN-LABEL: {{^}}class_f16_fneg: -; GCN: s_load_dword s[[SA_F16:[0-9]+]] ; GCN: s_load_dword s[[SB_I32:[0-9]+]] +; GCN: s_load_dword s[[SA_F16:[0-9]+]] ; GCN: v_mov_b32_e32 [[V_B_I32:v[0-9]+]], s[[SB_I32]] ; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -s[[SA_F16]], [[V_B_I32]] ; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]] @@ -68,8 +68,8 @@ } ; GCN-LABEL: {{^}}class_f16_fabs_fneg: -; GCN: s_load_dword s[[SA_F16:[0-9]+]] ; GCN: s_load_dword s[[SB_I32:[0-9]+]] +; GCN: s_load_dword s[[SA_F16:[0-9]+]] ; GCN: v_mov_b32_e32 [[V_B_I32:v[0-9]+]], s[[SB_I32]] ; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|s[[SA_F16]]|, [[V_B_I32]] ; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll @@ -19,25 +19,25 @@ ; ; VI-LABEL: s_cvt_pkrtz_v2f16_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, v0 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s2, v0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: s_cvt_pkrtz_v2f16_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s4, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s2, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_cvt_pkrtz_v2f16_f32: @@ -69,19 +69,19 @@ ; ; VI-LABEL: s_cvt_pkrtz_samereg_v2f16_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: s_cvt_pkrtz_samereg_v2f16_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s4, s4 @@ -149,15 +149,15 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] glc +; VI-NEXT: flat_load_dword v5, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v1, v[2:3] glc +; VI-NEXT: flat_load_dword v2, v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, v1 -; VI-NEXT: flat_store_dword v[4:5], v0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, v5, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32: @@ -225,13 +225,13 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] glc +; VI-NEXT: flat_load_dword v3, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, 1.0 -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, v3, 1.0 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm: @@ -290,13 +290,13 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] glc +; VI-NEXT: flat_load_dword v3, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, 1.0, v0 -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, 1.0, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg: @@ -363,15 +363,15 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] glc +; VI-NEXT: flat_load_dword v5, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v1, v[2:3] glc +; VI-NEXT: flat_load_dword v2, v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, -v0, v1 -; VI-NEXT: flat_store_dword v[4:5], v0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, -v5, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: @@ -448,15 +448,15 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] glc +; VI-NEXT: flat_load_dword v5, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v1, v[2:3] glc +; VI-NEXT: flat_load_dword v2, v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, -v1 -; VI-NEXT: flat_store_dword v[4:5], v0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, v5, -v2 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: @@ -533,15 +533,15 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] glc +; VI-NEXT: flat_load_dword v5, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v1, v[2:3] glc +; VI-NEXT: flat_load_dword v2, v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, -v0, -v1 -; VI-NEXT: flat_store_dword v[4:5], v0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, -v5, -v2 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: @@ -619,15 +619,15 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] glc +; VI-NEXT: flat_load_dword v5, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v1, v[2:3] glc +; VI-NEXT: flat_load_dword v2, v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, -|v0|, -v1 -; VI-NEXT: flat_store_dword v[4:5], v0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, -|v5|, -v2 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.ll @@ -49,8 +49,8 @@ ; GCN-LABEL: {{^}}mad_f32_imm_c: ; GCN: v_mov_b32_e32 [[C:v[0-9]+]], 0x41000000 -; GCN: s_load_dword [[A:s[0-9]+]] ; GCN: s_load_dword [[B:s[0-9]+]] +; GCN: s_load_dword [[A:s[0-9]+]] ; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]] ; GCN: v_mac_f32_e32 [[C]], {{s[0-9]+}}, [[VB]]{{$}} define amdgpu_kernel void @mad_f32_imm_c( diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll @@ -3424,11 +3424,11 @@ ; VERDE-LABEL: image_load_mmo: ; VERDE: ; %bb.0: ; VERDE-NEXT: image_load v1, v[1:2], s[0:7] dmask:0x1 unorm -; VERDE-NEXT: v_mov_b32_e32 v3, 0 +; VERDE-NEXT: v_mov_b32_e32 v2, 0 ; VERDE-NEXT: s_mov_b32 m0, -1 -; VERDE-NEXT: ds_write_b32 v0, v3 +; VERDE-NEXT: ds_write_b32 v0, v2 ; VERDE-NEXT: v_add_i32_e32 v0, vcc, 16, v0 -; VERDE-NEXT: ds_write_b32 v0, v3 +; VERDE-NEXT: ds_write_b32 v0, v2 ; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: v_mov_b32_e32 v0, v1 ; VERDE-NEXT: s_waitcnt lgkmcnt(0) @@ -3437,9 +3437,9 @@ ; FIJI-LABEL: image_load_mmo: ; FIJI: ; %bb.0: ; FIJI-NEXT: image_load v1, v[1:2], s[0:7] dmask:0x1 unorm -; FIJI-NEXT: v_mov_b32_e32 v3, 0 +; FIJI-NEXT: v_mov_b32_e32 v2, 0 ; FIJI-NEXT: s_mov_b32 m0, -1 -; FIJI-NEXT: ds_write2_b32 v0, v3, v3 offset1:4 +; FIJI-NEXT: ds_write2_b32 v0, v2, v2 offset1:4 ; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: v_mov_b32_e32 v0, v1 ; FIJI-NEXT: s_waitcnt lgkmcnt(0) @@ -3448,8 +3448,8 @@ ; GFX6789-LABEL: image_load_mmo: ; GFX6789: ; %bb.0: ; GFX6789-NEXT: image_load v1, v[1:2], s[0:7] dmask:0x1 unorm -; GFX6789-NEXT: v_mov_b32_e32 v3, 0 -; GFX6789-NEXT: ds_write2_b32 v0, v3, v3 offset1:4 +; GFX6789-NEXT: v_mov_b32_e32 v2, 0 +; GFX6789-NEXT: ds_write2_b32 v0, v2, v2 offset1:4 ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: v_mov_b32_e32 v0, v1 ; GFX6789-NEXT: s_waitcnt lgkmcnt(0) @@ -3458,8 +3458,8 @@ ; NOPRT-LABEL: image_load_mmo: ; NOPRT: ; %bb.0: ; NOPRT-NEXT: image_load v1, v[1:2], s[0:7] dmask:0x1 unorm -; NOPRT-NEXT: v_mov_b32_e32 v3, 0 -; NOPRT-NEXT: ds_write2_b32 v0, v3, v3 offset1:4 +; NOPRT-NEXT: v_mov_b32_e32 v2, 0 +; NOPRT-NEXT: ds_write2_b32 v0, v2, v2 offset1:4 ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: v_mov_b32_e32 v0, v1 ; NOPRT-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll @@ -1718,8 +1718,8 @@ ; VERDE-NEXT: s_mov_b32 s15, 0xf000 ; VERDE-NEXT: s_mov_b32 s14, -1 ; VERDE-NEXT: s_waitcnt vmcnt(0) -; VERDE-NEXT: buffer_store_dword v10, off, s[12:15], 0 ; VERDE-NEXT: v_mov_b32_e32 v0, v9 +; VERDE-NEXT: buffer_store_dword v10, off, s[12:15], 0 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll @@ -39,8 +39,8 @@ } ; GCN-LABEL: {{^}}tbuffer_load_immoffs_large -; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_SSCALED] offset:73 ; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 61 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] offset:4095 +; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_SSCALED] offset:73 ; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] offset:1 ; GFX10-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 61 format:[BUF_FMT_10_10_10_2_SSCALED] offset:4095 ; GFX10-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_FMT_32_32_UINT] offset:73 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll @@ -5,16 +5,16 @@ define amdgpu_kernel void @set_inactive(i32 addrspace(1)* %out, i32 %in) { ; GCN-LABEL: set_inactive: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 42 ; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0 store i32 %tmp, i32 addrspace(1)* %out @@ -24,19 +24,19 @@ define amdgpu_kernel void @set_inactive_64(i64 addrspace(1)* %out, i64 %in) { ; GCN-LABEL: set_inactive_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm %tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) #0 store i64 %tmp, i64 addrspace(1)* %out @@ -46,34 +46,35 @@ define amdgpu_kernel void @set_inactive_scc(i32 addrspace(1)* %out, i32 %in, <4 x i32> inreg %desc) { ; GCN-LABEL: set_inactive_scc: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN-NEXT: s_load_dword s6, s[0:1], 0x2c -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_buffer_load_dword s0, s[0:3], 0x0 -; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_buffer_load_dword s3, s[4:7], 0x0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 42 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s0, 56 -; GCN-NEXT: s_mov_b64 s[0:1], -1 +; GCN-NEXT: s_cmp_lg_u32 s3, 56 +; GCN-NEXT: s_mov_b64 s[2:3], -1 ; GCN-NEXT: s_cbranch_scc1 .LBB2_3 ; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN-NEXT: s_cbranch_vccz .LBB2_4 ; GCN-NEXT: .LBB2_2: ; %.exit ; GCN-NEXT: s_endpgm ; GCN-NEXT: .LBB2_3: ; %.one ; GCN-NEXT: v_add_u32_e32 v1, vcc, 1, v0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; GCN-NEXT: s_mov_b64 s[2:3], 0 ; GCN-NEXT: s_cbranch_execnz .LBB2_2 ; GCN-NEXT: .LBB2_4: ; %.zero -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 0, i32 0) %cmp = icmp eq i32 %val, 56 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll @@ -42,8 +42,8 @@ ; GCN-LABEL: {{^}}tbuffer_load_immoffs_large ; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0 -; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_SSCALED] idxen offset:73 ; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 61 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] idxen offset:4095 +; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_SSCALED] idxen offset:73 ; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:1 ; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 61 format:[BUF_FMT_10_10_10_2_SSCALED] idxen offset:4095 ; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_FMT_32_32_UINT] idxen offset:73 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll @@ -33,8 +33,8 @@ } ; GCN-LABEL: {{^}}tbuffer_load_immoffs_large -; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_SSCALED] offset:73 ; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 61 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] offset:4095 +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_SSCALED] offset:73 ; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] offset:1 ; GCN: s_waitcnt define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>} @tbuffer_load_immoffs_large(<4 x i32> inreg, i32 inreg %soffs) { diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll @@ -17,14 +17,14 @@ ; ; VI-LABEL: bfe_u32_arg_arg_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_bfe_u32 v0, v0, s1, s1 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_bfe_u32 v0, v0, s5, s5 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 %src1) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 @@ -47,15 +47,15 @@ ; ; VI-LABEL: bfe_u32_arg_arg_imm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v1, 0x7b -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_bfe_u32 v0, s0, v0, v1 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_bfe_u32 v0, s4, v0, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 123) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 @@ -78,15 +78,15 @@ ; ; VI-LABEL: bfe_u32_arg_imm_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x7b -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_bfe_u32 v0, s0, v0, v1 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_bfe_u32 v0, s4, v0, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 123, i32 %src2) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 @@ -110,16 +110,16 @@ ; ; VI-LABEL: bfe_u32_imm_arg_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: s_movk_i32 s2, 0x7b -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_movk_i32 s6, 0x7b +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_bfe_u32 v0, s2, v0, v1 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_bfe_u32 v0, s6, v0, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 123, i32 %src1, i32 %src2) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 @@ -196,19 +196,19 @@ ; ; VI-LABEL: bfe_u32_zextload_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %load = load i8, i8 addrspace(1)* %in %ext = zext i8 %load to i32 @@ -240,21 +240,21 @@ ; ; VI-LABEL: bfe_u32_zext_in_reg_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0 ; VI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 1 @@ -286,21 +286,21 @@ ; ; VI-LABEL: bfe_u32_zext_in_reg_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0 ; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 1 @@ -333,22 +333,22 @@ ; ; VI-LABEL: bfe_u32_zext_in_reg_i8_offset_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0 ; VI-NEXT: v_and_b32_e32 v0, 0xfe, v0 ; VI-NEXT: v_bfe_u32 v0, v0, 1, 8 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 1 @@ -381,22 +381,22 @@ ; ; VI-LABEL: bfe_u32_zext_in_reg_i8_offset_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0 ; VI-NEXT: v_and_b32_e32 v0, 0xf8, v0 ; VI-NEXT: v_bfe_u32 v0, v0, 3, 8 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 1 @@ -429,22 +429,22 @@ ; ; VI-LABEL: bfe_u32_zext_in_reg_i8_offset_7: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0 ; VI-NEXT: v_and_b32_e32 v0, 0x80, v0 ; VI-NEXT: v_bfe_u32 v0, v0, 7, 8 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 1 @@ -476,21 +476,21 @@ ; ; VI-LABEL: bfe_u32_zext_in_reg_i16_offset_8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0 ; VI-NEXT: v_bfe_u32 v0, v0, 8, 8 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 1 @@ -521,20 +521,20 @@ ; ; VI-LABEL: bfe_u32_test_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v0, 1, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 0, i32 1) @@ -645,20 +645,20 @@ ; ; VI-LABEL: bfe_u32_test_5: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_bfe_i32 v0, v0, 0, 1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 @@ -690,21 +690,21 @@ ; ; VI-LABEL: bfe_u32_test_6: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v0, 31, v0 ; VI-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 @@ -734,20 +734,20 @@ ; ; VI-LABEL: bfe_u32_test_7: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v0, 31, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 @@ -777,20 +777,20 @@ ; ; VI-LABEL: bfe_u32_test_8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v0, 1, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 @@ -820,20 +820,20 @@ ; ; VI-LABEL: bfe_u32_test_9: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v0, 31, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 31, i32 1) @@ -862,20 +862,20 @@ ; ; VI-LABEL: bfe_u32_test_10: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 1, i32 31) @@ -904,20 +904,20 @@ ; ; VI-LABEL: bfe_u32_test_11: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 8, i32 24) @@ -946,20 +946,20 @@ ; ; VI-LABEL: bfe_u32_test_12: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v0, 24, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 24, i32 8) @@ -989,20 +989,20 @@ ; ; VI-LABEL: bfe_u32_test_13: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v0, 31, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 %shl = ashr i32 %x, 31 @@ -1558,23 +1558,26 @@ ; ; VI-LABEL: simplify_bfe_u32_multi_use_arg: ; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v0, 63, v0 ; VI-NEXT: v_bfe_u32 v1, v0, 2, 2 -; VI-NEXT: buffer_store_dword v1, off, s[8:11], 0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 { @@ -1601,14 +1604,14 @@ ; ; VI-LABEL: lshr_and: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bfe_u32 s0, s0, 0x30006 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_bfe_u32 s4, s4, 0x30006 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %b = lshr i32 %a, 6 %c = and i32 %b, 7 @@ -1632,15 +1635,15 @@ ; ; VI-LABEL: v_lshr_and: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s0, s0, s1 -; VI-NEXT: s_and_b32 s0, s0, 7 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_lshr_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s4, s4, 7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %c = lshr i32 %a, %b %d = and i32 %c, 7 @@ -1663,14 +1666,14 @@ ; ; VI-LABEL: and_lshr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bfe_u32 s0, s0, 0x30006 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_bfe_u32 s4, s4, 0x30006 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %b = and i32 %a, 448 %c = lshr i32 %b, 6 @@ -1693,14 +1696,14 @@ ; ; VI-LABEL: and_lshr2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bfe_u32 s0, s0, 0x30006 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_bfe_u32 s4, s4, 0x30006 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %b = and i32 %a, 511 %c = lshr i32 %b, 6 @@ -1723,14 +1726,14 @@ ; ; VI-LABEL: shl_lshr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bfe_u32 s0, s0, 0x150002 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_bfe_u32 s4, s4, 0x150002 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %b = shl i32 %a, 9 %c = lshr i32 %b, 11 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll @@ -7,24 +7,24 @@ define amdgpu_kernel void @cos_f16(half addrspace(1)* %r, half addrspace(1)* %a) { ; GFX6-LABEL: cos_f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: s_mov_b32 s10, s2 -; GFX6-NEXT: s_mov_b32 s11, s3 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s10, s6 +; GFX6-NEXT: s_mov_b32 s11, s7 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s2 +; GFX6-NEXT: s_mov_b32 s9, s3 ; GFX6-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x3e22f983, v0 ; GFX6-NEXT: v_fract_f32_e32 v0, v0 ; GFX6-NEXT: v_cos_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: cos_f16: @@ -75,33 +75,33 @@ define amdgpu_kernel void @cos_v2f16(<2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { ; GFX6-LABEL: cos_v2f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: s_mov_b32 s10, s2 -; GFX6-NEXT: s_mov_b32 s11, s3 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s10, s6 +; GFX6-NEXT: s_mov_b32 s11, s7 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s2 +; GFX6-NEXT: s_mov_b32 s9, s3 ; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX6-NEXT: s_mov_b32 s0, 0x3e22f983 -; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_mov_b32 s2, 0x3e22f983 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, s0, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, s2, v1 ; GFX6-NEXT: v_fract_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, s2, v0 ; GFX6-NEXT: v_fract_f32_e32 v0, v0 ; GFX6-NEXT: v_cos_f32_e32 v0, v0 ; GFX6-NEXT: v_cos_f32_e32 v1, v1 -; GFX6-NEXT: s_mov_b32 s0, s4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: cos_v2f16: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll @@ -126,8 +126,8 @@ ; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; VI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] ; VI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] -; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]] -; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]] +; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[C_F16_1]], v[[B_F16_1]], v[[A_F16_1]] +; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[C_V2_F16]], v[[B_V2_F16]], v[[A_V2_F16]] ; GFX9: v_pk_fma_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]] @@ -334,10 +334,10 @@ ; VI-DAG: v_lshrrev_b32_e32 v[[C_F16_0:[0-9]+]], 16, v[[C_V4_F16_LO]] ; VI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V4_F16_HI]] -; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V4_F16_LO]], v[[B_V4_F16_LO]], v[[C_V4_F16_LO]] -; VI-DAG: v_fma_f16 v[[R1_F16_0:[0-9]+]], v[[A_F16_0]], v[[B_F16_0]], v[[C_F16_0]] -; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_V4_F16_HI]], v[[B_V4_F16_HI]], v[[C_V4_F16_HI]] -; VI-DAG: v_fma_f16 v[[R1_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]] +; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[C_V4_F16_LO]], v[[B_V4_F16_LO]], v[[A_V4_F16_LO]] +; VI-DAG: v_fma_f16 v[[R1_F16_0:[0-9]+]], v[[C_F16_0]], v[[B_F16_0]], v[[A_F16_0]] +; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[C_V4_F16_HI]], v[[B_V4_F16_HI]], v[[A_V4_F16_HI]] +; VI-DAG: v_fma_f16 v[[R1_F16_1:[0-9]+]], v[[C_F16_1]], v[[B_F16_1]], v[[A_F16_1]] ; SIVI-DAG: v_or_b32_e32 v[[R_V4_F16_LO:[0-9]+]], v[[R_F16_0]], v[[R1_F16_0]] ; SIVI-DAG: v_or_b32_e32 v[[R_V4_F16_HI:[0-9]+]], v[[R_F16_1]], v[[R1_F16_1]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll @@ -161,8 +161,8 @@ ; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] ; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] -; VI-DENORM-DAG: v_fma_f16 v[[RES0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]] -; VI-DENORM-DAG: v_fma_f16 v[[RES1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]] +; VI-DENORM-DAG: v_fma_f16 v[[RES0:[0-9]+]], v[[C_V2_F16]], v[[B_V2_F16]], v[[A_V2_F16]] +; VI-DENORM-DAG: v_fma_f16 v[[RES1:[0-9]+]], v[[C_F16_1]], v[[B_F16_1]], v[[A_F16_1]] ; VI-DENORM-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[RES1]] ; VI-DENORM-NOT: v_and_b32 ; VI-DENORM: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[RES0]], v[[R_F16_HI]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -127,61 +127,61 @@ define amdgpu_kernel void @maxnum_f16_imm_a( ; SI-LABEL: maxnum_f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_max_f32_e32 v0, 0x40400000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: maxnum_f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_max_f16_e32 v0, v0, v0 ; VI-NEXT: v_max_f16_e32 v0, 0x4200, v0 -; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: maxnum_f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: s_mov_b32 s6, s2 -; GFX9-NEXT: s_mov_b32 s7, s3 -; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX9-NEXT: v_max_f16_e32 v0, 0x4200, v0 -; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: maxnum_f16_imm_a: @@ -214,61 +214,61 @@ define amdgpu_kernel void @maxnum_f16_imm_b( ; SI-LABEL: maxnum_f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_max_f32_e32 v0, 4.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: maxnum_f16_imm_b: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_max_f16_e32 v0, v0, v0 ; VI-NEXT: v_max_f16_e32 v0, 4.0, v0 -; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: maxnum_f16_imm_b: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: s_mov_b32 s6, s2 -; GFX9-NEXT: s_mov_b32 s7, s3 -; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX9-NEXT: v_max_f16_e32 v0, 4.0, v0 -; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: maxnum_f16_imm_b: @@ -303,18 +303,18 @@ ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s6, s[6:7], 0x0 +; SI-NEXT: s_load_dword s2, s[6:7], 0x0 ; SI-NEXT: s_load_dword s0, s[0:1], 0x0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s1, s6, 16 +; SI-NEXT: s_lshr_b32 s1, s2, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 ; SI-NEXT: s_lshr_b32 s0, s0, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s0 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -323,51 +323,49 @@ ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_max_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: maxnum_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_load_dword s4, s[6:7], 0x0 -; VI-NEXT: s_load_dword s5, s[8:9], 0x0 +; VI-NEXT: s_load_dword s8, s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[2:3], 0x0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v1, s4, s4 -; VI-NEXT: v_max_f16_e64 v0, s5, s5 -; VI-NEXT: s_lshr_b32 s5, s5, 16 -; VI-NEXT: s_lshr_b32 s4, s4, 16 +; VI-NEXT: v_max_f16_e64 v0, s8, s8 +; VI-NEXT: v_max_f16_e64 v1, s2, s2 +; VI-NEXT: s_lshr_b32 s0, s8, 16 ; VI-NEXT: v_max_f16_e32 v0, v1, v0 -; VI-NEXT: v_max_f16_e64 v1, s5, s5 -; VI-NEXT: v_max_f16_e64 v2, s4, s4 +; VI-NEXT: v_max_f16_e64 v1, s0, s0 +; VI-NEXT: s_lshr_b32 s0, s2, 16 +; VI-NEXT: v_max_f16_e64 v2, s0, s0 ; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: maxnum_v2f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s10, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s11, s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s10, s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s11, s[6:7], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v1, s10, s10 -; GFX9-NEXT: v_pk_max_f16 v0, s11, s11 +; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 +; GFX9-NEXT: v_pk_max_f16 v1, s11, s11 ; GFX9-NEXT: v_pk_max_f16 v0, v1, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -424,15 +422,13 @@ ; ; VI-LABEL: maxnum_v2f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x4400 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dword s4, s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_load_dword s4, s[6:7], 0x0 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_max_f16_e64 v0, s4, s4 ; VI-NEXT: s_lshr_b32 s4, s4, 16 ; VI-NEXT: v_max_f16_e64 v1, s4, s4 @@ -445,17 +441,15 @@ ; GFX9-LABEL: maxnum_v2f16_imm_a: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s0, 0x44004200 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s2, s2 -; GFX9-NEXT: v_pk_max_f16 v0, v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 +; GFX9-NEXT: s_mov_b32 s4, 0x44004200 +; GFX9-NEXT: v_pk_max_f16 v0, v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: maxnum_v2f16_imm_a: @@ -504,15 +498,13 @@ ; ; VI-LABEL: maxnum_v2f16_imm_b: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x4200 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dword s4, s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_load_dword s4, s[6:7], 0x0 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_max_f16_e64 v0, s4, s4 ; VI-NEXT: s_lshr_b32 s4, s4, 16 ; VI-NEXT: v_max_f16_e64 v1, s4, s4 @@ -525,17 +517,15 @@ ; GFX9-LABEL: maxnum_v2f16_imm_b: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s0, 0x42004400 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s2, s2 -; GFX9-NEXT: v_pk_max_f16 v0, v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 +; GFX9-NEXT: s_mov_b32 s4, 0x42004400 +; GFX9-NEXT: v_pk_max_f16 v0, v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: maxnum_v2f16_imm_b: @@ -564,22 +554,21 @@ ; SI-LABEL: maxnum_v3f16: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 -; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s1, s6, 16 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 +; SI-NEXT: s_lshr_b32 s2, s2, 16 +; SI-NEXT: s_lshr_b32 s3, s0, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_max_f32_e32 v2, v3, v2 @@ -592,58 +581,57 @@ ; SI-NEXT: v_max_f32_e32 v0, v0, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 +; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: maxnum_v3f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v1, s4, s4 -; VI-NEXT: v_max_f16_e64 v0, s6, s6 -; VI-NEXT: s_lshr_b32 s6, s6, 16 -; VI-NEXT: s_lshr_b32 s4, s4, 16 +; VI-NEXT: v_max_f16_e64 v0, s8, s8 +; VI-NEXT: v_max_f16_e64 v1, s2, s2 +; VI-NEXT: s_lshr_b32 s0, s8, 16 ; VI-NEXT: v_max_f16_e32 v0, v1, v0 -; VI-NEXT: v_max_f16_e64 v1, s6, s6 -; VI-NEXT: v_max_f16_e64 v2, s4, s4 +; VI-NEXT: v_max_f16_e64 v1, s0, s0 +; VI-NEXT: s_lshr_b32 s0, s2, 16 +; VI-NEXT: v_max_f16_e64 v2, s0, s0 ; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_max_f16_e64 v1, s7, s7 -; VI-NEXT: v_max_f16_e64 v2, s5, s5 +; VI-NEXT: v_max_f16_e64 v1, s9, s9 +; VI-NEXT: v_max_f16_e64 v2, s3, s3 ; VI-NEXT: v_max_f16_e32 v1, v2, v1 -; VI-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: maxnum_v3f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[12:13], s[6:7], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v1, s4, s4 ; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 +; GFX9-NEXT: v_pk_max_f16 v1, s12, s12 ; GFX9-NEXT: v_pk_max_f16 v2, s11, s11 ; GFX9-NEXT: v_pk_max_f16 v0, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v1, s5, s5 +; GFX9-NEXT: v_pk_max_f16 v1, s13, s13 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v2 ; GFX9-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -684,39 +672,38 @@ ; SI-LABEL: maxnum_v4f16: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: s_lshr_b32 s5, s5, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 ; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: s_lshr_b32 s6, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 +; SI-NEXT: s_lshr_b32 s6, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 ; SI-NEXT: s_lshr_b32 s6, s5, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s4, s4, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_max_f32_e32 v3, v3, v5 ; SI-NEXT: v_mul_f32_e32 v5, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_max_f32_e32 v1, v1, v5 ; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_max_f32_e32 v2, v2, v5 ; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_max_f32_e32 v0, v0, v4 @@ -731,54 +718,54 @@ ; ; VI-LABEL: maxnum_v4f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v1, s5, s5 -; VI-NEXT: v_max_f16_e64 v0, s7, s7 -; VI-NEXT: s_lshr_b32 s7, s7, 16 -; VI-NEXT: s_lshr_b32 s5, s5, 16 +; VI-NEXT: v_max_f16_e64 v0, s9, s9 +; VI-NEXT: v_max_f16_e64 v1, s3, s3 +; VI-NEXT: s_lshr_b32 s0, s9, 16 ; VI-NEXT: v_max_f16_e32 v0, v1, v0 -; VI-NEXT: v_max_f16_e64 v1, s7, s7 -; VI-NEXT: v_max_f16_e64 v2, s5, s5 +; VI-NEXT: v_max_f16_e64 v1, s0, s0 +; VI-NEXT: s_lshr_b32 s0, s3, 16 +; VI-NEXT: v_max_f16_e64 v2, s0, s0 ; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v1, v0, v1 -; VI-NEXT: v_max_f16_e64 v0, s6, s6 -; VI-NEXT: v_max_f16_e64 v2, s4, s4 -; VI-NEXT: s_lshr_b32 s5, s6, 16 -; VI-NEXT: s_lshr_b32 s4, s4, 16 +; VI-NEXT: v_max_f16_e64 v0, s8, s8 +; VI-NEXT: v_max_f16_e64 v2, s2, s2 +; VI-NEXT: s_lshr_b32 s0, s8, 16 ; VI-NEXT: v_max_f16_e32 v0, v2, v0 -; VI-NEXT: v_max_f16_e64 v2, s5, s5 -; VI-NEXT: v_max_f16_e64 v3, s4, s4 +; VI-NEXT: v_max_f16_e64 v2, s0, s0 +; VI-NEXT: s_lshr_b32 s0, s2, 16 +; VI-NEXT: v_max_f16_e64 v3, s0, s0 ; VI-NEXT: v_max_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v2 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: maxnum_v4f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[12:13], s[6:7], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v1, s5, s5 ; GFX9-NEXT: v_pk_max_f16 v0, s11, s11 +; GFX9-NEXT: v_pk_max_f16 v1, s13, s13 ; GFX9-NEXT: v_pk_max_f16 v2, s10, s10 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 +; GFX9-NEXT: v_pk_max_f16 v0, s12, s12 ; GFX9-NEXT: v_pk_max_f16 v0, v0, v2 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -816,14 +803,12 @@ define amdgpu_kernel void @fmax_v4f16_imm_a( ; SI-LABEL: fmax_v4f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 ; SI-NEXT: s_lshr_b32 s5, s5, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 @@ -851,48 +836,48 @@ ; ; VI-LABEL: fmax_v4f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x4400 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v1, s5, s5 -; VI-NEXT: s_lshr_b32 s5, s5, 16 -; VI-NEXT: v_max_f16_e64 v3, s5, s5 -; VI-NEXT: v_max_f16_e64 v2, s4, s4 +; VI-NEXT: s_lshr_b32 s0, s3, 16 +; VI-NEXT: v_max_f16_e64 v1, s3, s3 +; VI-NEXT: v_max_f16_e64 v3, s0, s0 +; VI-NEXT: v_max_f16_e64 v2, s2, s2 ; VI-NEXT: v_max_f16_e32 v1, 0x4200, v1 ; VI-NEXT: v_max_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s4, s4, 16 +; VI-NEXT: s_lshr_b32 s0, s2, 16 ; VI-NEXT: v_or_b32_e32 v1, v1, v0 ; VI-NEXT: v_max_f16_e32 v0, 0x4800, v2 -; VI-NEXT: v_max_f16_e64 v2, s4, s4 +; VI-NEXT: v_max_f16_e64 v2, s0, s0 ; VI-NEXT: v_mov_b32_e32 v3, 0x4000 ; VI-NEXT: v_max_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v2 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fmax_v4f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s8, 0x44004200 ; GFX9-NEXT: s_mov_b32 s9, 0x40004800 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s7, s7 -; GFX9-NEXT: v_pk_max_f16 v2, s6, s6 +; GFX9-NEXT: v_pk_max_f16 v0, s3, s3 +; GFX9-NEXT: v_pk_max_f16 v2, s2, s2 ; GFX9-NEXT: v_pk_max_f16 v1, v0, s8 ; GFX9-NEXT: v_pk_max_f16 v0, v2, s9 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: fmax_v4f16_imm_a: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -155,61 +155,61 @@ define amdgpu_kernel void @minnum_f16_imm_a( ; SI-LABEL: minnum_f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_min_f32_e32 v0, 0x40400000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: minnum_f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_max_f16_e32 v0, v0, v0 ; VI-NEXT: v_min_f16_e32 v0, 0x4200, v0 -; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: minnum_f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: s_mov_b32 s6, s2 -; GFX9-NEXT: s_mov_b32 s7, s3 -; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX9-NEXT: v_min_f16_e32 v0, 0x4200, v0 -; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: minnum_f16_imm_a: @@ -242,61 +242,61 @@ define amdgpu_kernel void @minnum_f16_imm_b( ; SI-LABEL: minnum_f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_min_f32_e32 v0, 4.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: minnum_f16_imm_b: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_max_f16_e32 v0, v0, v0 ; VI-NEXT: v_min_f16_e32 v0, 4.0, v0 -; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: minnum_f16_imm_b: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: s_mov_b32 s6, s2 -; GFX9-NEXT: s_mov_b32 s7, s3 -; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX9-NEXT: v_min_f16_e32 v0, 4.0, v0 -; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: minnum_f16_imm_b: @@ -331,18 +331,18 @@ ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s6, s[6:7], 0x0 +; SI-NEXT: s_load_dword s2, s[6:7], 0x0 ; SI-NEXT: s_load_dword s0, s[0:1], 0x0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s1, s6, 16 +; SI-NEXT: s_lshr_b32 s1, s2, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 ; SI-NEXT: s_lshr_b32 s0, s0, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s0 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -351,51 +351,49 @@ ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_min_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: minnum_v2f16_ieee: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_load_dword s4, s[6:7], 0x0 -; VI-NEXT: s_load_dword s5, s[8:9], 0x0 +; VI-NEXT: s_load_dword s8, s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[2:3], 0x0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v1, s4, s4 -; VI-NEXT: v_max_f16_e64 v0, s5, s5 -; VI-NEXT: s_lshr_b32 s5, s5, 16 -; VI-NEXT: s_lshr_b32 s4, s4, 16 +; VI-NEXT: v_max_f16_e64 v0, s8, s8 +; VI-NEXT: v_max_f16_e64 v1, s2, s2 +; VI-NEXT: s_lshr_b32 s0, s8, 16 ; VI-NEXT: v_min_f16_e32 v0, v1, v0 -; VI-NEXT: v_max_f16_e64 v1, s5, s5 -; VI-NEXT: v_max_f16_e64 v2, s4, s4 +; VI-NEXT: v_max_f16_e64 v1, s0, s0 +; VI-NEXT: s_lshr_b32 s0, s2, 16 +; VI-NEXT: v_max_f16_e64 v2, s0, s0 ; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: minnum_v2f16_ieee: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s10, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s11, s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s10, s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s11, s[6:7], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v1, s10, s10 -; GFX9-NEXT: v_pk_max_f16 v0, s11, s11 +; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 +; GFX9-NEXT: v_pk_max_f16 v1, s11, s11 ; GFX9-NEXT: v_pk_min_f16 v0, v1, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -487,15 +485,13 @@ ; ; VI-LABEL: minnum_v2f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x4400 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dword s4, s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_load_dword s4, s[6:7], 0x0 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_max_f16_e64 v0, s4, s4 ; VI-NEXT: s_lshr_b32 s4, s4, 16 ; VI-NEXT: v_max_f16_e64 v1, s4, s4 @@ -508,17 +504,15 @@ ; GFX9-LABEL: minnum_v2f16_imm_a: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s0, 0x44004200 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s2, s2 -; GFX9-NEXT: v_pk_min_f16 v0, v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 +; GFX9-NEXT: s_mov_b32 s4, 0x44004200 +; GFX9-NEXT: v_pk_min_f16 v0, v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: minnum_v2f16_imm_a: @@ -567,15 +561,13 @@ ; ; VI-LABEL: minnum_v2f16_imm_b: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x4200 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dword s4, s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_load_dword s4, s[6:7], 0x0 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_max_f16_e64 v0, s4, s4 ; VI-NEXT: s_lshr_b32 s4, s4, 16 ; VI-NEXT: v_max_f16_e64 v1, s4, s4 @@ -588,17 +580,15 @@ ; GFX9-LABEL: minnum_v2f16_imm_b: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s0, 0x42004400 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s2, s2 -; GFX9-NEXT: v_pk_min_f16 v0, v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 +; GFX9-NEXT: s_mov_b32 s4, 0x42004400 +; GFX9-NEXT: v_pk_min_f16 v0, v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: minnum_v2f16_imm_b: @@ -627,22 +617,21 @@ ; SI-LABEL: minnum_v3f16: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 -; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s1, s6, 16 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 +; SI-NEXT: s_lshr_b32 s2, s2, 16 +; SI-NEXT: s_lshr_b32 s3, s0, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_min_f32_e32 v2, v3, v2 @@ -655,58 +644,57 @@ ; SI-NEXT: v_min_f32_e32 v0, v0, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 +; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: minnum_v3f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v1, s4, s4 -; VI-NEXT: v_max_f16_e64 v0, s6, s6 -; VI-NEXT: s_lshr_b32 s6, s6, 16 -; VI-NEXT: s_lshr_b32 s4, s4, 16 +; VI-NEXT: v_max_f16_e64 v0, s8, s8 +; VI-NEXT: v_max_f16_e64 v1, s2, s2 +; VI-NEXT: s_lshr_b32 s0, s8, 16 ; VI-NEXT: v_min_f16_e32 v0, v1, v0 -; VI-NEXT: v_max_f16_e64 v1, s6, s6 -; VI-NEXT: v_max_f16_e64 v2, s4, s4 +; VI-NEXT: v_max_f16_e64 v1, s0, s0 +; VI-NEXT: s_lshr_b32 s0, s2, 16 +; VI-NEXT: v_max_f16_e64 v2, s0, s0 ; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_max_f16_e64 v1, s7, s7 -; VI-NEXT: v_max_f16_e64 v2, s5, s5 +; VI-NEXT: v_max_f16_e64 v1, s9, s9 +; VI-NEXT: v_max_f16_e64 v2, s3, s3 ; VI-NEXT: v_min_f16_e32 v1, v2, v1 -; VI-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: minnum_v3f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[12:13], s[6:7], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v1, s4, s4 ; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 +; GFX9-NEXT: v_pk_max_f16 v1, s12, s12 ; GFX9-NEXT: v_pk_max_f16 v2, s11, s11 ; GFX9-NEXT: v_pk_min_f16 v0, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v1, s5, s5 +; GFX9-NEXT: v_pk_max_f16 v1, s13, s13 ; GFX9-NEXT: v_pk_min_f16 v1, v1, v2 ; GFX9-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -747,39 +735,38 @@ ; SI-LABEL: minnum_v4f16: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: s_lshr_b32 s5, s5, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 ; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: s_lshr_b32 s6, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 +; SI-NEXT: s_lshr_b32 s6, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 ; SI-NEXT: s_lshr_b32 s6, s5, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s4, s4, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_min_f32_e32 v3, v3, v5 ; SI-NEXT: v_mul_f32_e32 v5, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_min_f32_e32 v1, v1, v5 ; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_min_f32_e32 v2, v2, v5 ; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_min_f32_e32 v0, v0, v4 @@ -794,54 +781,54 @@ ; ; VI-LABEL: minnum_v4f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v1, s5, s5 -; VI-NEXT: v_max_f16_e64 v0, s7, s7 -; VI-NEXT: s_lshr_b32 s7, s7, 16 -; VI-NEXT: s_lshr_b32 s5, s5, 16 +; VI-NEXT: v_max_f16_e64 v0, s9, s9 +; VI-NEXT: v_max_f16_e64 v1, s3, s3 +; VI-NEXT: s_lshr_b32 s0, s9, 16 ; VI-NEXT: v_min_f16_e32 v0, v1, v0 -; VI-NEXT: v_max_f16_e64 v1, s7, s7 -; VI-NEXT: v_max_f16_e64 v2, s5, s5 +; VI-NEXT: v_max_f16_e64 v1, s0, s0 +; VI-NEXT: s_lshr_b32 s0, s3, 16 +; VI-NEXT: v_max_f16_e64 v2, s0, s0 ; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v1, v0, v1 -; VI-NEXT: v_max_f16_e64 v0, s6, s6 -; VI-NEXT: v_max_f16_e64 v2, s4, s4 -; VI-NEXT: s_lshr_b32 s5, s6, 16 -; VI-NEXT: s_lshr_b32 s4, s4, 16 +; VI-NEXT: v_max_f16_e64 v0, s8, s8 +; VI-NEXT: v_max_f16_e64 v2, s2, s2 +; VI-NEXT: s_lshr_b32 s0, s8, 16 ; VI-NEXT: v_min_f16_e32 v0, v2, v0 -; VI-NEXT: v_max_f16_e64 v2, s5, s5 -; VI-NEXT: v_max_f16_e64 v3, s4, s4 +; VI-NEXT: v_max_f16_e64 v2, s0, s0 +; VI-NEXT: s_lshr_b32 s0, s2, 16 +; VI-NEXT: v_max_f16_e64 v3, s0, s0 ; VI-NEXT: v_min_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v2 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: minnum_v4f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[12:13], s[6:7], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v1, s5, s5 ; GFX9-NEXT: v_pk_max_f16 v0, s11, s11 +; GFX9-NEXT: v_pk_max_f16 v1, s13, s13 ; GFX9-NEXT: v_pk_max_f16 v2, s10, s10 ; GFX9-NEXT: v_pk_min_f16 v1, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 +; GFX9-NEXT: v_pk_max_f16 v0, s12, s12 ; GFX9-NEXT: v_pk_min_f16 v0, v0, v2 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -879,14 +866,12 @@ define amdgpu_kernel void @fmin_v4f16_imm_a( ; SI-LABEL: fmin_v4f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 ; SI-NEXT: s_lshr_b32 s5, s5, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 @@ -914,48 +899,48 @@ ; ; VI-LABEL: fmin_v4f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x4400 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v1, s5, s5 -; VI-NEXT: s_lshr_b32 s5, s5, 16 -; VI-NEXT: v_max_f16_e64 v3, s5, s5 -; VI-NEXT: v_max_f16_e64 v2, s4, s4 +; VI-NEXT: s_lshr_b32 s0, s3, 16 +; VI-NEXT: v_max_f16_e64 v1, s3, s3 +; VI-NEXT: v_max_f16_e64 v3, s0, s0 +; VI-NEXT: v_max_f16_e64 v2, s2, s2 ; VI-NEXT: v_min_f16_e32 v1, 0x4200, v1 ; VI-NEXT: v_min_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s4, s4, 16 +; VI-NEXT: s_lshr_b32 s0, s2, 16 ; VI-NEXT: v_or_b32_e32 v1, v1, v0 ; VI-NEXT: v_min_f16_e32 v0, 0x4800, v2 -; VI-NEXT: v_max_f16_e64 v2, s4, s4 +; VI-NEXT: v_max_f16_e64 v2, s0, s0 ; VI-NEXT: v_mov_b32_e32 v3, 0x4000 ; VI-NEXT: v_min_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v2 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fmin_v4f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s8, 0x44004200 ; GFX9-NEXT: s_mov_b32 s9, 0x40004800 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s7, s7 -; GFX9-NEXT: v_pk_max_f16 v2, s6, s6 +; GFX9-NEXT: v_pk_max_f16 v0, s3, s3 +; GFX9-NEXT: v_pk_max_f16 v2, s2, s2 ; GFX9-NEXT: v_pk_min_f16 v1, v0, s8 ; GFX9-NEXT: v_pk_min_f16 v0, v2, s9 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: fmin_v4f16_imm_a: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -5,63 +5,63 @@ define amdgpu_kernel void @round_f64(double addrspace(1)* %out, double %x) #0 { ; SI-LABEL: round_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s3, 0xfffff -; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s1, 0xfffff ; SI-NEXT: v_mov_b32_e32 v4, 0x3ff00000 +; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s0, s11, 0xb0014 -; SI-NEXT: s_add_i32 s5, s0, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s5 -; SI-NEXT: s_andn2_b64 s[2:3], s[10:11], s[0:1] -; SI-NEXT: s_and_b32 s0, s11, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s5, 0 +; SI-NEXT: s_bfe_u32 s0, s7, 0xb0014 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_add_i32 s4, s0, 0xfffffc01 +; SI-NEXT: s_mov_b32 s0, s10 +; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 +; SI-NEXT: s_andn2_b64 s[2:3], s[6:7], s[0:1] +; SI-NEXT: s_and_b32 s0, s7, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s4, 0 ; SI-NEXT: v_mov_b32_e32 v0, s3 ; SI-NEXT: v_mov_b32_e32 v1, s0 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_gt_i32 s5, 51 +; SI-NEXT: s_cmp_gt_i32 s4, 51 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NEXT: v_mov_b32_e32 v1, s11 +; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1] ; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; SI-NEXT: v_add_f64 v[2:3], s[10:11], -v[0:1] +; SI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1] ; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_mov_b32_e32 v5, s11 +; SI-NEXT: v_mov_b32_e32 v5, s7 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 ; SI-NEXT: v_bfi_b32 v4, s0, v4, v5 ; SI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc ; SI-NEXT: v_mov_b32_e32 v2, 0 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s4, s8 -; SI-NEXT: s_mov_b32 s5, s9 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: round_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_brev_b32 s8, -2 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_brev_b32 s5, -2 ; CI-NEXT: v_mov_b32_e32 v4, 0x3ff00000 -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_trunc_f64_e32 v[0:1], s[6:7] -; CI-NEXT: v_mov_b32_e32 v5, s7 -; CI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1] -; CI-NEXT: v_bfi_b32 v4, s8, v4, v5 +; CI-NEXT: v_trunc_f64_e32 v[0:1], s[2:3] +; CI-NEXT: v_mov_b32_e32 v5, s3 +; CI-NEXT: v_add_f64 v[2:3], s[2:3], -v[0:1] +; CI-NEXT: v_bfi_b32 v4, s5, v4, v5 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 ; CI-NEXT: v_mov_b32_e32 v2, 0 ; CI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc ; CI-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] -; CI-NEXT: s_mov_b32 s0, s4 -; CI-NEXT: s_mov_b32 s1, s5 -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: s_mov_b32 s5, s1 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: s_endpgm %result = call double @llvm.round.f64(double %x) #1 store double %result, double addrspace(1)* %out @@ -71,23 +71,23 @@ define amdgpu_kernel void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: v_round_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[0:1], s[6:7] -; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_movk_i32 s6, 0xfc01 -; SI-NEXT: s_mov_b32 s0, -1 -; SI-NEXT: s_mov_b32 s1, 0xfffff -; SI-NEXT: s_brev_b32 s7, -2 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_movk_i32 s4, 0xfc01 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s3, 0xfffff +; SI-NEXT: s_brev_b32 s5, -2 ; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bfe_u32 v4, v3, 20, 11 -; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v4 -; SI-NEXT: v_lshr_b64 v[4:5], s[0:1], v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, s4, v4 +; SI-NEXT: v_lshr_b64 v[4:5], s[2:3], v6 ; SI-NEXT: v_and_b32_e32 v7, 0x80000000, v3 ; SI-NEXT: v_not_b32_e32 v4, v4 ; SI-NEXT: v_not_b32_e32 v5, v5 @@ -100,37 +100,37 @@ ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc ; SI-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc ; SI-NEXT: v_add_f64 v[6:7], v[2:3], -v[4:5] -; SI-NEXT: v_bfi_b32 v2, s7, v8, v3 +; SI-NEXT: v_bfi_b32 v2, s5, v8, v3 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 -; SI-NEXT: s_mov_b64 s[6:7], s[2:3] +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc ; SI-NEXT: v_mov_b32_e32 v2, 0 ; SI-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] -; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; CI-LABEL: v_round_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[0:1], s[6:7] -; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 -; CI-NEXT: s_brev_b32 s0, -2 +; CI-NEXT: s_mov_b64 s[4:5], s[2:3] +; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; CI-NEXT: s_brev_b32 s2, -2 ; CI-NEXT: v_mov_b32_e32 v8, 0x3ff00000 -; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_trunc_f64_e32 v[4:5], v[2:3] ; CI-NEXT: v_add_f64 v[6:7], v[2:3], -v[4:5] -; CI-NEXT: v_bfi_b32 v2, s0, v8, v3 +; CI-NEXT: v_bfi_b32 v2, s2, v8, v3 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 +; CI-NEXT: s_mov_b64 s[2:3], s[6:7] ; CI-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc ; CI-NEXT: v_mov_b32_e32 v2, 0 ; CI-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] -; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep = getelementptr double, double addrspace(1)* %in, i32 %tid @@ -144,15 +144,15 @@ define amdgpu_kernel void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) #0 { ; SI-LABEL: round_v2f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_movk_i32 s7, 0xfc01 ; SI-NEXT: s_mov_b32 s3, 0xfffff -; SI-NEXT: s_mov_b32 s2, s6 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bfe_u32 s0, s11, 0xb0014 ; SI-NEXT: s_add_i32 s14, s0, s7 +; SI-NEXT: s_mov_b32 s2, s6 ; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s14 ; SI-NEXT: s_brev_b32 s15, 1 ; SI-NEXT: s_andn2_b64 s[12:13], s[10:11], s[0:1] @@ -210,30 +210,30 @@ ; ; CI-LABEL: round_v2f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd -; CI-NEXT: s_brev_b32 s6, -2 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_brev_b32 s2, -2 ; CI-NEXT: v_mov_b32_e32 v6, 0x3ff00000 -; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_trunc_f64_e32 v[0:1], s[2:3] -; CI-NEXT: v_mov_b32_e32 v4, s3 -; CI-NEXT: v_add_f64 v[2:3], s[2:3], -v[0:1] -; CI-NEXT: v_bfi_b32 v4, s6, v6, v4 +; CI-NEXT: v_trunc_f64_e32 v[0:1], s[6:7] +; CI-NEXT: v_mov_b32_e32 v4, s7 +; CI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1] +; CI-NEXT: v_bfi_b32 v4, s2, v6, v4 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 ; CI-NEXT: v_mov_b32_e32 v2, 0 ; CI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc -; CI-NEXT: v_trunc_f64_e32 v[4:5], s[0:1] +; CI-NEXT: v_trunc_f64_e32 v[4:5], s[4:5] ; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] -; CI-NEXT: v_add_f64 v[0:1], s[0:1], -v[4:5] -; CI-NEXT: v_mov_b32_e32 v7, s1 +; CI-NEXT: v_add_f64 v[0:1], s[4:5], -v[4:5] +; CI-NEXT: v_mov_b32_e32 v7, s5 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5 -; CI-NEXT: v_bfi_b32 v6, s6, v6, v7 +; CI-NEXT: v_bfi_b32 v6, s2, v6, v7 ; CI-NEXT: v_cndmask_b32_e32 v1, 0, v6, vcc ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: v_add_f64 v[0:1], v[4:5], v[0:1] -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: s_endpgm %result = call <2 x double> @llvm.round.v2f64(<2 x double> %in) #1 store <2 x double> %result, <2 x double> addrspace(1)* %out @@ -244,9 +244,9 @@ ; SI-LABEL: round_v4f64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 -; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s14, -1 ; SI-NEXT: s_movk_i32 s18, 0xfc01 +; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xfffff ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bfe_u32 s0, s7, 0xb0014 @@ -630,70 +630,70 @@ ; ; CI-LABEL: round_v8f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx16 s[0:15], s[0:1], 0x19 -; CI-NEXT: s_brev_b32 s18, -2 +; CI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 +; CI-NEXT: s_brev_b32 s2, -2 ; CI-NEXT: v_mov_b32_e32 v16, 0x3ff00000 -; CI-NEXT: s_mov_b32 s19, 0xf000 +; CI-NEXT: s_load_dwordx2 s[20:21], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s23, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_trunc_f64_e32 v[0:1], s[2:3] -; CI-NEXT: v_mov_b32_e32 v4, s3 -; CI-NEXT: v_add_f64 v[2:3], s[2:3], -v[0:1] -; CI-NEXT: v_bfi_b32 v4, s18, v16, v4 +; CI-NEXT: v_trunc_f64_e32 v[0:1], s[6:7] +; CI-NEXT: v_mov_b32_e32 v4, s7 +; CI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1] +; CI-NEXT: v_bfi_b32 v4, s2, v16, v4 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 ; CI-NEXT: v_mov_b32_e32 v2, 0 ; CI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc -; CI-NEXT: v_trunc_f64_e32 v[4:5], s[0:1] +; CI-NEXT: v_trunc_f64_e32 v[4:5], s[4:5] ; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] -; CI-NEXT: v_add_f64 v[0:1], s[0:1], -v[4:5] -; CI-NEXT: v_mov_b32_e32 v6, s1 +; CI-NEXT: v_add_f64 v[0:1], s[4:5], -v[4:5] +; CI-NEXT: v_mov_b32_e32 v6, s5 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5 -; CI-NEXT: v_bfi_b32 v6, s18, v16, v6 +; CI-NEXT: v_bfi_b32 v6, s2, v16, v6 ; CI-NEXT: v_cndmask_b32_e32 v1, 0, v6, vcc -; CI-NEXT: v_trunc_f64_e32 v[6:7], s[6:7] +; CI-NEXT: v_trunc_f64_e32 v[6:7], s[10:11] ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: v_add_f64 v[0:1], v[4:5], v[0:1] -; CI-NEXT: v_add_f64 v[4:5], s[6:7], -v[6:7] -; CI-NEXT: v_mov_b32_e32 v8, s7 +; CI-NEXT: v_add_f64 v[4:5], s[10:11], -v[6:7] +; CI-NEXT: v_mov_b32_e32 v8, s11 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 -; CI-NEXT: v_bfi_b32 v8, s18, v16, v8 +; CI-NEXT: v_bfi_b32 v8, s2, v16, v8 ; CI-NEXT: v_cndmask_b32_e32 v5, 0, v8, vcc -; CI-NEXT: v_trunc_f64_e32 v[8:9], s[4:5] +; CI-NEXT: v_trunc_f64_e32 v[8:9], s[8:9] ; CI-NEXT: v_mov_b32_e32 v4, 0 ; CI-NEXT: v_add_f64 v[6:7], v[6:7], v[4:5] -; CI-NEXT: v_add_f64 v[4:5], s[4:5], -v[8:9] -; CI-NEXT: v_mov_b32_e32 v10, s5 +; CI-NEXT: v_add_f64 v[4:5], s[8:9], -v[8:9] +; CI-NEXT: v_mov_b32_e32 v10, s9 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 -; CI-NEXT: v_bfi_b32 v10, s18, v16, v10 +; CI-NEXT: v_bfi_b32 v10, s2, v16, v10 ; CI-NEXT: v_cndmask_b32_e32 v5, 0, v10, vcc ; CI-NEXT: v_mov_b32_e32 v4, 0 ; CI-NEXT: v_add_f64 v[4:5], v[8:9], v[4:5] -; CI-NEXT: v_mov_b32_e32 v8, s11 -; CI-NEXT: v_bfi_b32 v18, s18, v16, v8 -; CI-NEXT: v_trunc_f64_e32 v[8:9], s[12:13] -; CI-NEXT: v_trunc_f64_e32 v[10:11], s[14:15] -; CI-NEXT: v_add_f64 v[14:15], s[12:13], -v[8:9] -; CI-NEXT: v_mov_b32_e32 v19, s15 +; CI-NEXT: v_mov_b32_e32 v8, s15 +; CI-NEXT: v_bfi_b32 v18, s2, v16, v8 +; CI-NEXT: v_trunc_f64_e32 v[8:9], s[16:17] +; CI-NEXT: v_trunc_f64_e32 v[10:11], s[18:19] +; CI-NEXT: v_add_f64 v[14:15], s[16:17], -v[8:9] +; CI-NEXT: v_mov_b32_e32 v19, s19 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5 -; CI-NEXT: v_add_f64 v[14:15], s[14:15], -v[10:11] -; CI-NEXT: v_mov_b32_e32 v17, s13 +; CI-NEXT: v_add_f64 v[14:15], s[18:19], -v[10:11] +; CI-NEXT: v_mov_b32_e32 v17, s17 ; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[14:15]|, 0.5 -; CI-NEXT: v_bfi_b32 v19, s18, v16, v19 -; CI-NEXT: v_trunc_f64_e32 v[12:13], s[8:9] -; CI-NEXT: v_bfi_b32 v17, s18, v16, v17 +; CI-NEXT: v_bfi_b32 v19, s2, v16, v19 +; CI-NEXT: v_trunc_f64_e32 v[12:13], s[12:13] +; CI-NEXT: v_bfi_b32 v17, s2, v16, v17 ; CI-NEXT: v_cndmask_b32_e64 v15, 0, v19, s[0:1] ; CI-NEXT: v_mov_b32_e32 v14, 0 ; CI-NEXT: v_add_f64 v[10:11], v[10:11], v[14:15] ; CI-NEXT: v_cndmask_b32_e32 v15, 0, v17, vcc ; CI-NEXT: v_mov_b32_e32 v14, 0 -; CI-NEXT: v_mov_b32_e32 v17, s9 +; CI-NEXT: v_mov_b32_e32 v17, s13 ; CI-NEXT: v_add_f64 v[8:9], v[8:9], v[14:15] -; CI-NEXT: v_add_f64 v[14:15], s[8:9], -v[12:13] -; CI-NEXT: v_bfi_b32 v19, s18, v16, v17 -; CI-NEXT: v_trunc_f64_e32 v[16:17], s[10:11] +; CI-NEXT: v_add_f64 v[14:15], s[12:13], -v[12:13] +; CI-NEXT: v_bfi_b32 v19, s2, v16, v17 +; CI-NEXT: v_trunc_f64_e32 v[16:17], s[14:15] ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5 -; CI-NEXT: v_add_f64 v[14:15], s[10:11], -v[16:17] -; CI-NEXT: s_mov_b32 s18, -1 +; CI-NEXT: v_add_f64 v[14:15], s[14:15], -v[16:17] +; CI-NEXT: s_mov_b32 s22, -1 ; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[14:15]|, 0.5 ; CI-NEXT: v_mov_b32_e32 v14, 0 ; CI-NEXT: v_cndmask_b32_e64 v15, 0, v18, s[0:1] @@ -701,10 +701,10 @@ ; CI-NEXT: v_cndmask_b32_e32 v17, 0, v19, vcc ; CI-NEXT: v_mov_b32_e32 v16, 0 ; CI-NEXT: v_add_f64 v[12:13], v[12:13], v[16:17] -; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[16:19], 0 offset:48 -; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[16:19], 0 offset:32 -; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16 -; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 +; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[20:23], 0 offset:48 +; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[20:23], 0 offset:32 +; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[20:23], 0 offset:16 +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 ; CI-NEXT: s_endpgm %result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1 store <8 x double> %result, <8 x double> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll @@ -7,24 +7,24 @@ define amdgpu_kernel void @sin_f16(half addrspace(1)* %r, half addrspace(1)* %a) { ; GFX6-LABEL: sin_f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: s_mov_b32 s10, s2 -; GFX6-NEXT: s_mov_b32 s11, s3 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s10, s6 +; GFX6-NEXT: s_mov_b32 s11, s7 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s2 +; GFX6-NEXT: s_mov_b32 s9, s3 ; GFX6-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x3e22f983, v0 ; GFX6-NEXT: v_fract_f32_e32 v0, v0 ; GFX6-NEXT: v_sin_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: sin_f16: @@ -75,33 +75,33 @@ define amdgpu_kernel void @sin_v2f16(<2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { ; GFX6-LABEL: sin_v2f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: s_mov_b32 s10, s2 -; GFX6-NEXT: s_mov_b32 s11, s3 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s10, s6 +; GFX6-NEXT: s_mov_b32 s11, s7 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s2 +; GFX6-NEXT: s_mov_b32 s9, s3 ; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX6-NEXT: s_mov_b32 s0, 0x3e22f983 -; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_mov_b32 s2, 0x3e22f983 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, s0, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, s2, v1 ; GFX6-NEXT: v_fract_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, s2, v0 ; GFX6-NEXT: v_fract_f32_e32 v0, v0 ; GFX6-NEXT: v_sin_f32_e32 v0, v0 ; GFX6-NEXT: v_sin_f32_e32 v1, v1 -; GFX6-NEXT: s_mov_b32 s0, s4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: sin_v2f16: diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -26,9 +26,9 @@ ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: flat_load_ushort v2, v[2:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_load_ushort v2, v[0:1] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) @@ -37,19 +37,19 @@ ; ; GCN-NOHSA-VI-LABEL: constant_load_i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s6 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s7 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s2 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, s3 -; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3 +; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: constant_load_i16: @@ -100,26 +100,24 @@ ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) +; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: s_load_dword s0, s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GCN-HSA-NEXT: flat_store_dword v[0:1], v2 ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: constant_load_v2i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 -; GCN-NOHSA-VI-NEXT: s_load_dword s0, s[2:3], 0x0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 +; GCN-NOHSA-VI-NEXT: s_load_dword s4, s[2:3], 0x0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: constant_load_v2i16: @@ -180,17 +178,15 @@ ; GCN-NOHSA-VI-LABEL: constant_load_v3i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 +; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s3 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s2 -; GCN-NOHSA-VI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 -; GCN-NOHSA-VI-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NOHSA-VI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 +; GCN-NOHSA-VI-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: constant_load_v3i16: @@ -252,28 +248,26 @@ ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) +; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCN-HSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: constant_load_v4i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 -; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: constant_load_v4i16: @@ -317,32 +311,30 @@ ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) +; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: constant_load_v8i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: constant_load_v8i16: @@ -414,14 +406,12 @@ ; ; GCN-NOHSA-VI-LABEL: constant_load_v16i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s1 -; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x0 -; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 @@ -514,16 +504,17 @@ ; GCN-HSA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GCN-HSA-NEXT: s_waitcnt vmcnt(1) ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[0:3] +; GCN-HSA-NEXT: s_waitcnt vmcnt(1) +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: constant_load_v16i16_align2: @@ -532,46 +523,53 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 -; GCN-NOHSA-VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:2 -; GCN-NOHSA-VI-NEXT: buffer_load_ushort v4, off, s[0:3], 0 offset:4 +; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:14 +; GCN-NOHSA-VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:10 ; GCN-NOHSA-VI-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:6 -; GCN-NOHSA-VI-NEXT: buffer_load_ushort v5, off, s[0:3], 0 offset:8 -; GCN-NOHSA-VI-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:10 -; GCN-NOHSA-VI-NEXT: buffer_load_ushort v6, off, s[0:3], 0 offset:12 -; GCN-NOHSA-VI-NEXT: buffer_load_ushort v7, off, s[0:3], 0 offset:14 -; GCN-NOHSA-VI-NEXT: buffer_load_ushort v8, off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_load_ushort v9, off, s[0:3], 0 offset:18 -; GCN-NOHSA-VI-NEXT: buffer_load_ushort v10, off, s[0:3], 0 offset:20 -; GCN-NOHSA-VI-NEXT: buffer_load_ushort v11, off, s[0:3], 0 offset:22 -; GCN-NOHSA-VI-NEXT: buffer_load_ushort v12, off, s[0:3], 0 offset:24 -; GCN-NOHSA-VI-NEXT: buffer_load_ushort v13, off, s[0:3], 0 offset:26 -; GCN-NOHSA-VI-NEXT: buffer_load_ushort v14, off, s[0:3], 0 offset:28 -; GCN-NOHSA-VI-NEXT: buffer_load_ushort v15, off, s[0:3], 0 offset:30 +; GCN-NOHSA-VI-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:2 +; GCN-NOHSA-VI-NEXT: buffer_load_ushort v4, off, s[0:3], 0 offset:30 +; GCN-NOHSA-VI-NEXT: buffer_load_ushort v5, off, s[0:3], 0 offset:26 +; GCN-NOHSA-VI-NEXT: buffer_load_ushort v6, off, s[0:3], 0 offset:22 +; GCN-NOHSA-VI-NEXT: buffer_load_ushort v7, off, s[0:3], 0 offset:18 +; GCN-NOHSA-VI-NEXT: buffer_load_ushort v8, off, s[0:3], 0 offset:12 +; GCN-NOHSA-VI-NEXT: buffer_load_ushort v9, off, s[0:3], 0 offset:8 +; GCN-NOHSA-VI-NEXT: buffer_load_ushort v10, off, s[0:3], 0 offset:4 +; GCN-NOHSA-VI-NEXT: buffer_load_ushort v11, off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_ushort v12, off, s[0:3], 0 offset:28 +; GCN-NOHSA-VI-NEXT: buffer_load_ushort v13, off, s[0:3], 0 offset:24 +; GCN-NOHSA-VI-NEXT: buffer_load_ushort v14, off, s[0:3], 0 offset:20 +; GCN-NOHSA-VI-NEXT: buffer_load_ushort v15, off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(14) -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v0, v18, v0 +; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(13) +; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v16, 16, v2 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(12) -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v1, v17, v4 +; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(11) +; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(10) -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v16, 16, v3 -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v2, v16, v5 +; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(9) +; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v18, 16, v6 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(8) -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v3, v7, v6 +; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v19, 16, v7 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(7) +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v3, v0, v8 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(6) -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v4, v9, v8 +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v2, v1, v9 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(5) +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v1, v16, v10 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(4) -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v5, v11, v10 +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v0, v17, v11 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v7, v4, v12 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(2) -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v6, v13, v12 +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v6, v5, v13 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v5, v18, v14 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v7, v15, v14 +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v4, v19, v15 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm @@ -623,9 +621,9 @@ ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: flat_load_ushort v2, v[2:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_load_ushort v2, v[0:1] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) @@ -634,19 +632,19 @@ ; ; GCN-NOHSA-VI-LABEL: constant_zextload_i16_to_i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s6 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s7 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s2 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, s3 -; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3 +; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: constant_zextload_i16_to_i32: @@ -692,9 +690,9 @@ ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: flat_load_sshort v2, v[2:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_load_sshort v2, v[0:1] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) @@ -703,19 +701,19 @@ ; ; GCN-NOHSA-VI-LABEL: constant_sextload_i16_to_i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s6 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s7 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s2 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, s3 -; GCN-NOHSA-VI-NEXT: buffer_load_sshort v0, off, s[4:7], 0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3 +; GCN-NOHSA-VI-NEXT: buffer_load_sshort v0, off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: constant_sextload_i16_to_i32: @@ -762,9 +760,9 @@ ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: flat_load_ushort v2, v[2:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_load_ushort v2, v[0:1] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) @@ -773,19 +771,19 @@ ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v1i16_to_v1i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s6 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s7 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s2 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, s3 -; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3 +; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: constant_zextload_v1i16_to_v1i32: @@ -831,9 +829,9 @@ ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: flat_load_sshort v2, v[2:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_load_sshort v2, v[0:1] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) @@ -842,19 +840,19 @@ ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v1i16_to_v1i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s6 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s7 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s2 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, s3 -; GCN-NOHSA-VI-NEXT: buffer_load_sshort v0, off, s[4:7], 0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3 +; GCN-NOHSA-VI-NEXT: buffer_load_sshort v0, off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: constant_sextload_v1i16_to_v1i32: @@ -913,18 +911,16 @@ ; GCN-NOHSA-VI-LABEL: constant_zextload_v2i16_to_v2i32: ; GCN-NOHSA-VI: ; %bb.0: ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_load_dword s2, s[2:3], 0x0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 +; GCN-NOHSA-VI-NEXT: s_load_dword s4, s[2:3], 0x0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s2, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s2, 0xffff -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s1 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s4, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: constant_zextload_v2i16_to_v2i32: @@ -986,18 +982,16 @@ ; GCN-NOHSA-VI-LABEL: constant_sextload_v2i16_to_v2i32: ; GCN-NOHSA-VI: ; %bb.0: ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_load_dword s2, s[2:3], 0x0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 +; GCN-NOHSA-VI-NEXT: s_load_dword s4, s[2:3], 0x0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s0, s2, 16 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s1, s2 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s1 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s5, s4, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s4, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: constant_sextload_v2i16_to_v2i32: @@ -1049,39 +1043,39 @@ ; GCN-HSA-LABEL: constant_zextload_v3i16_to_v3i32: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s6, 0xffff +; GCN-HSA-NEXT: s_mov_b32 s4, 0xffff ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) +; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s1 -; GCN-HSA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_lshr_b32 s2, s0, 16 -; GCN-HSA-NEXT: s_and_b32 s1, s1, s6 -; GCN-HSA-NEXT: s_and_b32 s0, s0, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s2 +; GCN-HSA-NEXT: s_lshr_b32 s0, s2, 16 +; GCN-HSA-NEXT: s_and_b32 s1, s3, s4 +; GCN-HSA-NEXT: s_and_b32 s2, s2, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s1 ; GCN-HSA-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v3i16_to_v3i32: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, 0xffff -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, s8 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s4, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s3, s8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s1, s2, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s2, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: constant_zextload_v3i16_to_v3i32: @@ -1137,29 +1131,27 @@ ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) +; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s1 -; GCN-HSA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_ashr_i32 s2, s0, 16 -; GCN-HSA-NEXT: s_sext_i32_i16 s1, s1 -; GCN-HSA-NEXT: s_sext_i32_i16 s0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s2 +; GCN-HSA-NEXT: s_ashr_i32 s0, s2, 16 +; GCN-HSA-NEXT: s_sext_i32_i16 s1, s3 +; GCN-HSA-NEXT: s_sext_i32_i16 s2, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s1 ; GCN-HSA-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v3i16_to_v3i32: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s6, s4, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s5, s5 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s4, s4 @@ -1227,43 +1219,43 @@ ; GCN-HSA-LABEL: constant_zextload_v4i16_to_v4i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s6, 0xffff +; GCN-HSA-NEXT: s_mov_b32 s4, 0xffff ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) +; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_lshr_b32 s2, s1, 16 -; GCN-HSA-NEXT: s_lshr_b32 s3, s0, 16 -; GCN-HSA-NEXT: s_and_b32 s1, s1, s6 -; GCN-HSA-NEXT: s_and_b32 s0, s0, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s2 +; GCN-HSA-NEXT: s_lshr_b32 s0, s3, 16 +; GCN-HSA-NEXT: s_lshr_b32 s1, s2, 16 +; GCN-HSA-NEXT: s_and_b32 s3, s3, s4 +; GCN-HSA-NEXT: s_and_b32 s2, s2, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v4i16_to_v4i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, 0xffff -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s5, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, s8 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s4, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s6 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s3, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s3, s8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s3, s2, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s2, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: constant_zextload_v4i16_to_v4i32: @@ -1325,31 +1317,29 @@ ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) +; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_ashr_i32 s4, s0, 16 -; GCN-HSA-NEXT: s_ashr_i64 s[2:3], s[0:1], 48 -; GCN-HSA-NEXT: s_sext_i32_i16 s1, s1 -; GCN-HSA-NEXT: s_sext_i32_i16 s0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GCN-HSA-NEXT: s_ashr_i64 s[0:1], s[2:3], 48 +; GCN-HSA-NEXT: s_ashr_i32 s4, s2, 16 +; GCN-HSA-NEXT: s_sext_i32_i16 s1, s3 +; GCN-HSA-NEXT: s_sext_i32_i16 s2, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v4i16_to_v4i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s6, s5, 16 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s7, s4, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s5, s5 @@ -1432,66 +1422,66 @@ ; GCN-HSA-LABEL: constant_zextload_v8i16_to_v8i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s8, 0xffff ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GCN-HSA-NEXT: s_mov_b32 s2, 0xffff ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_lshr_b32 s9, s5, 16 -; GCN-HSA-NEXT: s_lshr_b32 s10, s4, 16 -; GCN-HSA-NEXT: s_lshr_b32 s2, s7, 16 -; GCN-HSA-NEXT: s_lshr_b32 s3, s6, 16 -; GCN-HSA-NEXT: s_and_b32 s5, s5, s8 -; GCN-HSA-NEXT: s_and_b32 s4, s4, s8 -; GCN-HSA-NEXT: s_and_b32 s7, s7, s8 -; GCN-HSA-NEXT: s_and_b32 s6, s6, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s2 +; GCN-HSA-NEXT: s_lshr_b32 s8, s5, 16 +; GCN-HSA-NEXT: s_lshr_b32 s9, s4, 16 +; GCN-HSA-NEXT: s_lshr_b32 s3, s7, 16 +; GCN-HSA-NEXT: s_lshr_b32 s10, s6, 16 +; GCN-HSA-NEXT: s_and_b32 s5, s5, s2 +; GCN-HSA-NEXT: s_and_b32 s4, s4, s2 +; GCN-HSA-NEXT: s_and_b32 s7, s7, s2 +; GCN-HSA-NEXT: s_and_b32 s2, s6, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v8i16_to_v8i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, 0xffff -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, 0xffff +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s7, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, s8 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s6, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, s8 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s5, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, s8 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s4, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s9, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s9, s2 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s3, s8, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, s2 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s11, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, s2 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s10, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s10, s2 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s12 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s9 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16 +; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: constant_zextload_v8i16_to_v8i32: @@ -1598,14 +1588,12 @@ ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v8i16_to_v8i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 -; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s10, s7, 16 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s11, s6, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s7, s7 @@ -1720,34 +1708,34 @@ ; GCN-HSA-LABEL: constant_zextload_v16i16_to_v16i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s12, 0xffff ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 +; GCN-HSA-NEXT: s_mov_b32 s2, 0xffff ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_lshr_b32 s13, s5, 16 -; GCN-HSA-NEXT: s_lshr_b32 s14, s4, 16 -; GCN-HSA-NEXT: s_lshr_b32 s15, s7, 16 -; GCN-HSA-NEXT: s_lshr_b32 s16, s6, 16 -; GCN-HSA-NEXT: s_lshr_b32 s17, s9, 16 -; GCN-HSA-NEXT: s_lshr_b32 s18, s8, 16 -; GCN-HSA-NEXT: s_lshr_b32 s2, s11, 16 -; GCN-HSA-NEXT: s_lshr_b32 s3, s10, 16 -; GCN-HSA-NEXT: s_and_b32 s5, s5, s12 -; GCN-HSA-NEXT: s_and_b32 s4, s4, s12 -; GCN-HSA-NEXT: s_and_b32 s7, s7, s12 -; GCN-HSA-NEXT: s_and_b32 s6, s6, s12 -; GCN-HSA-NEXT: s_and_b32 s9, s9, s12 -; GCN-HSA-NEXT: s_and_b32 s8, s8, s12 -; GCN-HSA-NEXT: s_and_b32 s11, s11, s12 -; GCN-HSA-NEXT: s_and_b32 s10, s10, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s2 +; GCN-HSA-NEXT: s_lshr_b32 s12, s5, 16 +; GCN-HSA-NEXT: s_lshr_b32 s13, s4, 16 +; GCN-HSA-NEXT: s_lshr_b32 s14, s7, 16 +; GCN-HSA-NEXT: s_lshr_b32 s15, s6, 16 +; GCN-HSA-NEXT: s_lshr_b32 s16, s9, 16 +; GCN-HSA-NEXT: s_lshr_b32 s17, s8, 16 +; GCN-HSA-NEXT: s_lshr_b32 s3, s11, 16 +; GCN-HSA-NEXT: s_lshr_b32 s18, s10, 16 +; GCN-HSA-NEXT: s_and_b32 s5, s5, s2 +; GCN-HSA-NEXT: s_and_b32 s4, s4, s2 +; GCN-HSA-NEXT: s_and_b32 s7, s7, s2 +; GCN-HSA-NEXT: s_and_b32 s6, s6, s2 +; GCN-HSA-NEXT: s_and_b32 s9, s9, s2 +; GCN-HSA-NEXT: s_and_b32 s8, s8, s2 +; GCN-HSA-NEXT: s_and_b32 s11, s11, s2 +; GCN-HSA-NEXT: s_and_b32 s2, s10, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s18 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -1755,63 +1743,63 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s12 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v16i16_to_v16i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s12, 0xffff +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[4:11], s[14:15], 0x0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s14, 0xffff +; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s12 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s13 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s19, s11, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, s12 +; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, s14 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s20, s10, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, s12 +; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, s14 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s9, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, s12 +; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, s14 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s8, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, s12 +; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, s14 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s20 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s19 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s7, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, s12 +; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, s14 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s6, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, s12 +; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, s14 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s13, s5, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s5, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s18 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s17 -; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, s12 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s4, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, s12 +; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, s14 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s13, s4, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, s14 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 @@ -1821,9 +1809,9 @@ ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s13 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s12 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; @@ -1995,14 +1983,12 @@ ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v16i16_to_v16i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 -; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s18, s11, 16 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s19, s10, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s11, s11 @@ -2200,27 +2186,31 @@ ; ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s20, 0xffff +; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x0 +; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 +; GCN-HSA-NEXT: s_mov_b32 s18, 0xffff ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_and_b32 s21, s5, s20 -; GCN-HSA-NEXT: s_and_b32 s22, s4, s20 -; GCN-HSA-NEXT: s_and_b32 s23, s7, s20 -; GCN-HSA-NEXT: s_and_b32 s24, s6, s20 -; GCN-HSA-NEXT: s_and_b32 s25, s9, s20 -; GCN-HSA-NEXT: s_and_b32 s26, s8, s20 -; GCN-HSA-NEXT: s_and_b32 s27, s11, s20 -; GCN-HSA-NEXT: s_and_b32 s28, s10, s20 -; GCN-HSA-NEXT: s_and_b32 s29, s13, s20 -; GCN-HSA-NEXT: s_and_b32 s30, s12, s20 -; GCN-HSA-NEXT: s_and_b32 s31, s15, s20 -; GCN-HSA-NEXT: s_and_b32 s33, s14, s20 -; GCN-HSA-NEXT: s_and_b32 s34, s17, s20 -; GCN-HSA-NEXT: s_and_b32 s35, s16, s20 -; GCN-HSA-NEXT: s_and_b32 s36, s19, s20 -; GCN-HSA-NEXT: s_and_b32 s20, s18, s20 +; GCN-HSA-NEXT: s_and_b32 s19, s1, s18 +; GCN-HSA-NEXT: s_and_b32 s20, s0, s18 +; GCN-HSA-NEXT: s_and_b32 s21, s3, s18 +; GCN-HSA-NEXT: s_and_b32 s22, s2, s18 +; GCN-HSA-NEXT: s_and_b32 s23, s5, s18 +; GCN-HSA-NEXT: s_and_b32 s24, s4, s18 +; GCN-HSA-NEXT: s_and_b32 s25, s7, s18 +; GCN-HSA-NEXT: s_and_b32 s26, s6, s18 +; GCN-HSA-NEXT: s_and_b32 s27, s9, s18 +; GCN-HSA-NEXT: s_and_b32 s28, s8, s18 +; GCN-HSA-NEXT: s_and_b32 s29, s11, s18 +; GCN-HSA-NEXT: s_and_b32 s30, s10, s18 +; GCN-HSA-NEXT: s_and_b32 s31, s13, s18 +; GCN-HSA-NEXT: s_and_b32 s33, s12, s18 +; GCN-HSA-NEXT: s_and_b32 s34, s15, s18 +; GCN-HSA-NEXT: s_and_b32 s18, s14, s18 +; GCN-HSA-NEXT: s_lshr_b32 s35, s1, 16 +; GCN-HSA-NEXT: s_lshr_b32 s36, s0, 16 +; GCN-HSA-NEXT: s_lshr_b32 s3, s3, 16 +; GCN-HSA-NEXT: s_lshr_b32 s2, s2, 16 ; GCN-HSA-NEXT: s_lshr_b32 s5, s5, 16 ; GCN-HSA-NEXT: s_lshr_b32 s4, s4, 16 ; GCN-HSA-NEXT: s_lshr_b32 s7, s7, 16 @@ -2233,167 +2223,163 @@ ; GCN-HSA-NEXT: s_lshr_b32 s12, s12, 16 ; GCN-HSA-NEXT: s_lshr_b32 s15, s15, 16 ; GCN-HSA-NEXT: s_lshr_b32 s14, s14, 16 -; GCN-HSA-NEXT: s_lshr_b32 s17, s17, 16 -; GCN-HSA-NEXT: s_lshr_b32 s16, s16, 16 -; GCN-HSA-NEXT: s_lshr_b32 s19, s19, 16 -; GCN-HSA-NEXT: s_lshr_b32 s18, s18, 16 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s18 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s36 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s19 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s35 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s16 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s34 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s17 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s33 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s31 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s34 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s15 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s33 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s12 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s31 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s13 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s30 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 64 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s29 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s13 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s28 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s27 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s9 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s25 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s9 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s24 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s23 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s22 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s21 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s35 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v32i16_to_v32i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s20, 0xffff -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[20:23], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s19, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s18, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[4:19], s[6:7], 0x0 +; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[22:23], 0x0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s22, 0xffff +; GCN-NOHSA-VI-NEXT: s_mov_b32 s16, s20 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s17, s21 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s36, s19, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s19, s20 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s37, s18, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s18, s18, s20 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s17, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s17, s17, s20 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s35, s16, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s16, s16, s20 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s36, s15, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s15, s22 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s37, s14, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, s22 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s13, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s13, s22 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s35, s12, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, s22 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s19 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s36 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s31, s15, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s15, s20 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s33, s14, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, s20 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s29, s13, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s31, s11, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, s22 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s33, s10, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, s22 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:112 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s29, s9, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s35 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s17 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s34 -; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s13, s20 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s12, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, s20 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s27, s11, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, s22 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s8, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, s22 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:96 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s27, s7, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s31 -; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, s20 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s10, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, s20 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s25, s9, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, s22 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s6, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, s22 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:80 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s25, s5, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s30 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s29 -; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, s20 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s8, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, s20 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s23, s7, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, s22 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s4, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, s22 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:64 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s23, s3, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s28 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s27 -; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, s20 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s6, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, s20 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s21, s5, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s3, s22 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s2, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s2, s22 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:48 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s20, s1, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s26 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s25 -; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, s20 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s4, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, s20 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s1, s22 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s21, s0, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s0, s22 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s24 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s23 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s22 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s21 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s21 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s20 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: constant_zextload_v32i16_to_v32i32: @@ -2700,14 +2686,12 @@ ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v32i16_to_v32i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s19, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s18, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s16, s0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s17, s1 -; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0 -; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s35, s15, 16 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s36, s14, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s15, s15 @@ -3283,64 +3267,64 @@ ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v64i16_to_v64i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s26, 0xffff +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s40, 0xffff ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x40 -; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[36:51], s[18:19], 0x0 +; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x0 +; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x40 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_and_b32 s53, s1, s26 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s37, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s37, s26 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s20, s36, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s21, s36, s26 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s39, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s23, s39, s26 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s38, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s25, s38, s26 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s27, s41, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s28, s41, s26 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s29, s40, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s30, s40, s26 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s31, s43, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s33, s43, s26 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s42, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s35, s42, s26 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s36, s45, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s37, s45, s26 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s38, s44, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s39, s44, s26 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s40, s47, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s41, s47, s26 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s42, s46, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s43, s46, s26 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s44, s49, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s45, s49, s26 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s46, s48, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s47, s48, s26 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s48, s51, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s49, s51, s26 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s51, s50, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s50, s50, s26 -; GCN-NOHSA-VI-NEXT: s_and_b32 s55, s0, s26 -; GCN-NOHSA-VI-NEXT: s_and_b32 s57, s3, s26 -; GCN-NOHSA-VI-NEXT: s_and_b32 s59, s2, s26 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s33, s17, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s17, s17, s40 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s16, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s16, s16, s40 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s35, s19, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s19, s40 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s38, s18, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s18, s18, s40 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s39, s21, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s21, s21, s40 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s41, s20, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s20, s20, s40 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s42, s23, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s23, s23, s40 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s43, s22, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s22, s22, s40 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s44, s25, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s25, s25, s40 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s45, s24, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s24, s24, s40 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s46, s27, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s27, s27, s40 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s47, s26, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s26, s26, s40 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s48, s29, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s29, s29, s40 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s49, s28, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s28, s28, s40 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s50, s31, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s31, s31, s40 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s51, s30, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s30, s30, s40 +; GCN-NOHSA-VI-NEXT: s_and_b32 s53, s1, s40 +; GCN-NOHSA-VI-NEXT: s_and_b32 s55, s0, s40 +; GCN-NOHSA-VI-NEXT: s_and_b32 s57, s3, s40 +; GCN-NOHSA-VI-NEXT: s_and_b32 s59, s2, s40 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s5, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, s26 +; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, s40 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s61, s4, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, s26 +; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, s40 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s7, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, s26 +; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, s40 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s63, s6, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, s26 -; GCN-NOHSA-VI-NEXT: s_and_b32 s64, s9, s26 -; GCN-NOHSA-VI-NEXT: s_and_b32 s65, s8, s26 -; GCN-NOHSA-VI-NEXT: s_and_b32 s66, s11, s26 -; GCN-NOHSA-VI-NEXT: s_and_b32 s67, s10, s26 -; GCN-NOHSA-VI-NEXT: s_and_b32 s68, s13, s26 -; GCN-NOHSA-VI-NEXT: s_and_b32 s69, s12, s26 -; GCN-NOHSA-VI-NEXT: s_and_b32 s70, s15, s26 -; GCN-NOHSA-VI-NEXT: s_and_b32 s26, s14, s26 +; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, s40 +; GCN-NOHSA-VI-NEXT: s_and_b32 s64, s9, s40 +; GCN-NOHSA-VI-NEXT: s_and_b32 s65, s8, s40 +; GCN-NOHSA-VI-NEXT: s_and_b32 s66, s11, s40 +; GCN-NOHSA-VI-NEXT: s_and_b32 s67, s10, s40 +; GCN-NOHSA-VI-NEXT: s_and_b32 s68, s13, s40 +; GCN-NOHSA-VI-NEXT: s_and_b32 s69, s12, s40 +; GCN-NOHSA-VI-NEXT: s_and_b32 s70, s15, s40 +; GCN-NOHSA-VI-NEXT: s_and_b32 s40, s14, s40 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s15, 16 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s14, 16 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s1, 16 @@ -3351,9 +3335,9 @@ ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s12, 16 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s16 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s17 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s36 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s37 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s40 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s14 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s70 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s15 @@ -3404,52 +3388,52 @@ ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s52 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s50 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s51 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s49 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s48 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s31 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s50 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s47 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s46 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s45 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s44 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s49 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s29 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s48 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s43 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s42 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s41 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s40 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s47 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s27 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s46 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s39 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s38 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s36 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s45 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s25 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s44 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s35 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s34 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s31 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s43 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s23 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s42 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s28 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s27 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s41 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s21 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s39 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s25 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s24 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s23 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s22 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s21 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s38 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s19 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s18 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s35 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s34 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s17 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s33 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; @@ -4028,9 +4012,10 @@ ; GCN-NOHSA-VI: ; %bb.0: ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x40 ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x0 +; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x40 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s49, s31, 16 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s69, s15, 16 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s70, s14, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s15, s15 @@ -4087,24 +4072,23 @@ ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s5, s5 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s4, s4 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s49, s31, 16 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s50, s30, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s62 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s61 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s50, s30, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s31, s31 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s60 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s59 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s31, s31 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s30, s30 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s58 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s56 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s57 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s55 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s30, s30 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s47, s29, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s54 @@ -4414,33 +4398,33 @@ ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: flat_load_ushort v2, v[2:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_load_ushort v0, v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN-HSA-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: constant_zextload_i16_to_i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3 ; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: constant_zextload_i16_to_i64: @@ -4493,33 +4477,33 @@ ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: flat_load_sshort v2, v[2:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_load_sshort v0, v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-HSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-HSA-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: constant_sextload_i16_to_i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s6 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s7 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s2 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, s3 -; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3 +; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: constant_sextload_i16_to_i64: @@ -4569,33 +4553,33 @@ ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: flat_load_ushort v2, v[2:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_load_ushort v0, v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN-HSA-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v1i16_to_v1i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3 ; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: constant_zextload_v1i16_to_v1i64: @@ -4643,33 +4627,33 @@ ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: flat_load_sshort v2, v[2:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_load_sshort v0, v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-HSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-HSA-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v1i16_to_v1i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s6 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s7 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s2 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, s3 -; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3 +; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: constant_sextload_v1i16_to_v1i64: @@ -4720,34 +4704,34 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) +; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_load_dword s0, s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_lshr_b32 s1, s0, 16 -; GCN-HSA-NEXT: s_and_b32 s0, s0, 0xffff -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s1 +; GCN-HSA-NEXT: s_lshr_b32 s0, s2, 16 +; GCN-HSA-NEXT: s_and_b32 s1, s2, 0xffff +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v2i16_to_v2i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_load_dword s4, s[6:7], 0x0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 +; GCN-NOHSA-VI-NEXT: s_load_dword s2, s[2:3], 0x0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s4, 0xffff -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s2, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s1, s2, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: constant_zextload_v2i16_to_v2i64: @@ -4799,30 +4783,28 @@ ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) +; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_load_dword s0, s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_lshr_b32 s2, s0, 16 -; GCN-HSA-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000 +; GCN-HSA-NEXT: s_lshr_b32 s0, s2, 16 ; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v2i16_to_v2i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NOHSA-VI-NEXT: s_load_dword s4, s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_load_dword s4, s[6:7], 0x0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[4:5], 0x100000 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 @@ -4891,54 +4873,54 @@ ; GCN-HSA-LABEL: constant_zextload_v4i16_to_v4i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s6, 0xffff +; GCN-HSA-NEXT: s_mov_b32 s4, 0xffff ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_lshr_b32 s4, s3, 16 -; GCN-HSA-NEXT: s_lshr_b32 s5, s2, 16 -; GCN-HSA-NEXT: s_and_b32 s7, s2, s6 -; GCN-HSA-NEXT: s_and_b32 s2, s3, s6 +; GCN-HSA-NEXT: s_lshr_b32 s5, s3, 16 +; GCN-HSA-NEXT: s_lshr_b32 s6, s2, 16 +; GCN-HSA-NEXT: s_and_b32 s7, s2, s4 +; GCN-HSA-NEXT: s_and_b32 s2, s3, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v4i16_to_v4i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s5, s8 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s5, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s4, s8 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s7 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s2, s8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s1, s2, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s3, s8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s3, s3, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: constant_zextload_v4i16_to_v4i64: @@ -5039,14 +5021,12 @@ ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v4i16_to_v4i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s5 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s5, 16 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[4:5], 0x100000 @@ -5149,23 +5129,23 @@ ; GCN-HSA-LABEL: constant_zextload_v8i16_to_v8i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s8, 0xffff ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GCN-HSA-NEXT: s_mov_b32 s2, 0xffff ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_lshr_b32 s9, s5, 16 -; GCN-HSA-NEXT: s_lshr_b32 s2, s7, 16 -; GCN-HSA-NEXT: s_lshr_b32 s10, s6, 16 -; GCN-HSA-NEXT: s_lshr_b32 s11, s4, 16 -; GCN-HSA-NEXT: s_and_b32 s4, s4, s8 -; GCN-HSA-NEXT: s_and_b32 s6, s6, s8 -; GCN-HSA-NEXT: s_and_b32 s5, s5, s8 -; GCN-HSA-NEXT: s_and_b32 s3, s7, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: s_lshr_b32 s8, s5, 16 +; GCN-HSA-NEXT: s_lshr_b32 s3, s7, 16 +; GCN-HSA-NEXT: s_lshr_b32 s9, s6, 16 +; GCN-HSA-NEXT: s_lshr_b32 s10, s4, 16 +; GCN-HSA-NEXT: s_and_b32 s4, s4, s2 +; GCN-HSA-NEXT: s_and_b32 s6, s6, s2 +; GCN-HSA-NEXT: s_and_b32 s5, s5, s2 +; GCN-HSA-NEXT: s_and_b32 s2, s7, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 @@ -5176,17 +5156,17 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s8 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm @@ -5194,37 +5174,38 @@ ; GCN-NOHSA-VI-LABEL: constant_zextload_v8i16_to_v8i64: ; GCN-NOHSA-VI: ; %bb.0: ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, 0xffff ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s4, s8 -; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s5, s8 -; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s6, s8 -; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s7, s8 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s7, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s5, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s8, s6 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s8, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s9, s6 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s9, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s10, s6 +; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s11, s6 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s11, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s10, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s9 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; @@ -5381,14 +5362,12 @@ ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v8i16_to_v8i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 -; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[6:7], 0x100000 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[6:7], 0x100000 @@ -5548,31 +5527,31 @@ ; GCN-HSA-LABEL: constant_zextload_v16i16_to_v16i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s12, 0xffff ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 +; GCN-HSA-NEXT: s_mov_b32 s2, 0xffff ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_lshr_b32 s13, s5, 16 -; GCN-HSA-NEXT: s_lshr_b32 s14, s7, 16 -; GCN-HSA-NEXT: s_lshr_b32 s15, s11, 16 -; GCN-HSA-NEXT: s_lshr_b32 s2, s9, 16 -; GCN-HSA-NEXT: s_lshr_b32 s16, s8, 16 -; GCN-HSA-NEXT: s_lshr_b32 s17, s10, 16 -; GCN-HSA-NEXT: s_lshr_b32 s18, s6, 16 -; GCN-HSA-NEXT: s_lshr_b32 s19, s4, 16 -; GCN-HSA-NEXT: s_and_b32 s4, s4, s12 -; GCN-HSA-NEXT: s_and_b32 s6, s6, s12 -; GCN-HSA-NEXT: s_and_b32 s10, s10, s12 -; GCN-HSA-NEXT: s_and_b32 s8, s8, s12 -; GCN-HSA-NEXT: s_and_b32 s5, s5, s12 -; GCN-HSA-NEXT: s_and_b32 s7, s7, s12 -; GCN-HSA-NEXT: s_and_b32 s11, s11, s12 -; GCN-HSA-NEXT: s_and_b32 s3, s9, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: s_lshr_b32 s12, s5, 16 +; GCN-HSA-NEXT: s_lshr_b32 s13, s7, 16 +; GCN-HSA-NEXT: s_lshr_b32 s14, s11, 16 +; GCN-HSA-NEXT: s_lshr_b32 s3, s9, 16 +; GCN-HSA-NEXT: s_lshr_b32 s15, s8, 16 +; GCN-HSA-NEXT: s_lshr_b32 s16, s10, 16 +; GCN-HSA-NEXT: s_lshr_b32 s17, s6, 16 +; GCN-HSA-NEXT: s_lshr_b32 s18, s4, 16 +; GCN-HSA-NEXT: s_and_b32 s4, s4, s2 +; GCN-HSA-NEXT: s_and_b32 s6, s6, s2 +; GCN-HSA-NEXT: s_and_b32 s10, s10, s2 +; GCN-HSA-NEXT: s_and_b32 s8, s8, s2 +; GCN-HSA-NEXT: s_and_b32 s5, s5, s2 +; GCN-HSA-NEXT: s_and_b32 s7, s7, s2 +; GCN-HSA-NEXT: s_and_b32 s11, s11, s2 +; GCN-HSA-NEXT: s_and_b32 s2, s9, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 @@ -5583,73 +5562,73 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s17 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s18 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v16i16_to_v16i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s12, 0xffff +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s4, s12 -; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s5, s12 -; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s6, s12 -; GCN-NOHSA-VI-NEXT: s_and_b32 s16, s7, s12 -; GCN-NOHSA-VI-NEXT: s_and_b32 s17, s8, s12 -; GCN-NOHSA-VI-NEXT: s_and_b32 s18, s9, s12 -; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s10, s12 -; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s11, s12 +; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[4:11], s[14:15], 0x0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s14, 0xffff +; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s12 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s13 +; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s4, s14 +; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s5, s14 +; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s6, s14 +; GCN-NOHSA-VI-NEXT: s_and_b32 s16, s7, s14 +; GCN-NOHSA-VI-NEXT: s_and_b32 s17, s8, s14 +; GCN-NOHSA-VI-NEXT: s_and_b32 s18, s9, s14 +; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s10, s14 +; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s11, s14 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s11, 16 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s10, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s9, 16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 @@ -5674,11 +5653,11 @@ ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s13 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s13 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm @@ -5957,14 +5936,12 @@ ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v16i16_to_v16i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s1 -; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x0 -; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[6:7], 0x100000 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x100000 @@ -6242,252 +6219,252 @@ ; ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s20, 0xffff +; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x0 +; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 +; GCN-HSA-NEXT: s_mov_b32 s18, 0xffff ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_and_b32 s21, s4, s20 -; GCN-HSA-NEXT: s_and_b32 s22, s6, s20 -; GCN-HSA-NEXT: s_and_b32 s23, s8, s20 -; GCN-HSA-NEXT: s_and_b32 s24, s10, s20 -; GCN-HSA-NEXT: s_and_b32 s25, s12, s20 -; GCN-HSA-NEXT: s_and_b32 s26, s14, s20 -; GCN-HSA-NEXT: s_and_b32 s27, s16, s20 -; GCN-HSA-NEXT: s_and_b32 s28, s18, s20 -; GCN-HSA-NEXT: s_and_b32 s29, s5, s20 -; GCN-HSA-NEXT: s_and_b32 s30, s7, s20 -; GCN-HSA-NEXT: s_and_b32 s31, s9, s20 -; GCN-HSA-NEXT: s_and_b32 s33, s11, s20 -; GCN-HSA-NEXT: s_and_b32 s34, s13, s20 -; GCN-HSA-NEXT: s_and_b32 s35, s15, s20 -; GCN-HSA-NEXT: s_and_b32 s36, s17, s20 -; GCN-HSA-NEXT: s_and_b32 s20, s19, s20 +; GCN-HSA-NEXT: s_and_b32 s19, s0, s18 +; GCN-HSA-NEXT: s_and_b32 s20, s2, s18 +; GCN-HSA-NEXT: s_and_b32 s21, s4, s18 +; GCN-HSA-NEXT: s_and_b32 s22, s6, s18 +; GCN-HSA-NEXT: s_and_b32 s23, s8, s18 +; GCN-HSA-NEXT: s_and_b32 s24, s10, s18 +; GCN-HSA-NEXT: s_and_b32 s25, s12, s18 +; GCN-HSA-NEXT: s_and_b32 s26, s14, s18 +; GCN-HSA-NEXT: s_and_b32 s27, s1, s18 +; GCN-HSA-NEXT: s_and_b32 s28, s3, s18 +; GCN-HSA-NEXT: s_and_b32 s29, s5, s18 +; GCN-HSA-NEXT: s_and_b32 s30, s7, s18 +; GCN-HSA-NEXT: s_and_b32 s31, s9, s18 +; GCN-HSA-NEXT: s_and_b32 s33, s11, s18 +; GCN-HSA-NEXT: s_and_b32 s34, s13, s18 +; GCN-HSA-NEXT: s_and_b32 s18, s15, s18 +; GCN-HSA-NEXT: s_lshr_b32 s35, s1, 16 +; GCN-HSA-NEXT: s_lshr_b32 s3, s3, 16 ; GCN-HSA-NEXT: s_lshr_b32 s5, s5, 16 ; GCN-HSA-NEXT: s_lshr_b32 s7, s7, 16 ; GCN-HSA-NEXT: s_lshr_b32 s9, s9, 16 ; GCN-HSA-NEXT: s_lshr_b32 s11, s11, 16 ; GCN-HSA-NEXT: s_lshr_b32 s13, s13, 16 ; GCN-HSA-NEXT: s_lshr_b32 s15, s15, 16 -; GCN-HSA-NEXT: s_lshr_b32 s17, s17, 16 -; GCN-HSA-NEXT: s_lshr_b32 s19, s19, 16 -; GCN-HSA-NEXT: s_lshr_b32 s18, s18, 16 -; GCN-HSA-NEXT: s_lshr_b32 s16, s16, 16 ; GCN-HSA-NEXT: s_lshr_b32 s14, s14, 16 ; GCN-HSA-NEXT: s_lshr_b32 s12, s12, 16 ; GCN-HSA-NEXT: s_lshr_b32 s10, s10, 16 ; GCN-HSA-NEXT: s_lshr_b32 s8, s8, 16 ; GCN-HSA-NEXT: s_lshr_b32 s6, s6, 16 ; GCN-HSA-NEXT: s_lshr_b32 s4, s4, 16 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s19 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s36 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s17 -; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s35 +; GCN-HSA-NEXT: s_lshr_b32 s2, s2, 16 +; GCN-HSA-NEXT: s_lshr_b32 s36, s0, 16 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xf0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xd0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xb0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x90 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s34 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 -; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s33 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s31 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s9 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s30 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s29 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s28 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s18 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xe0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s27 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s16 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s35 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xc0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xa0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s25 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x80 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s24 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s23 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s8 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s22 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s21 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v32i16_to_v32i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s20, 0xffff +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[20:23], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[4:19], s[6:7], 0x0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s19, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s18, -1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_and_b32 s21, s4, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s22, s5, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s23, s6, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s24, s7, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s25, s8, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s26, s9, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s27, s10, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s28, s11, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s29, s12, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s30, s13, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s31, s14, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s33, s15, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s34, s16, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s35, s17, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s36, s18, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s20, s19, s20 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s19, s19, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s18, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s19 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s17, 16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s16, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s18 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 +; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[22:23], 0x0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s22, 0xffff +; GCN-NOHSA-VI-NEXT: s_mov_b32 s16, s20 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s17, s21 +; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NOHSA-VI-NEXT: s_and_b32 s20, s0, s22 +; GCN-NOHSA-VI-NEXT: s_and_b32 s21, s1, s22 +; GCN-NOHSA-VI-NEXT: s_and_b32 s23, s2, s22 +; GCN-NOHSA-VI-NEXT: s_and_b32 s24, s3, s22 +; GCN-NOHSA-VI-NEXT: s_and_b32 s25, s4, s22 +; GCN-NOHSA-VI-NEXT: s_and_b32 s26, s5, s22 +; GCN-NOHSA-VI-NEXT: s_and_b32 s27, s6, s22 +; GCN-NOHSA-VI-NEXT: s_and_b32 s28, s7, s22 +; GCN-NOHSA-VI-NEXT: s_and_b32 s29, s8, s22 +; GCN-NOHSA-VI-NEXT: s_and_b32 s30, s9, s22 +; GCN-NOHSA-VI-NEXT: s_and_b32 s31, s10, s22 +; GCN-NOHSA-VI-NEXT: s_and_b32 s33, s11, s22 +; GCN-NOHSA-VI-NEXT: s_and_b32 s34, s12, s22 +; GCN-NOHSA-VI-NEXT: s_and_b32 s35, s13, s22 +; GCN-NOHSA-VI-NEXT: s_and_b32 s36, s14, s22 +; GCN-NOHSA-VI-NEXT: s_and_b32 s22, s15, s22 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s15, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s35 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s17 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s14, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s34 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s13, s13, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s33 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s13, s13, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:240 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s12, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s31 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:224 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s11, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s35 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:208 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s10, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s29 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s34 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:192 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s9, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s33 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:176 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s8, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s27 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s31 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:160 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s7, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:144 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s25 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s29 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:128 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s5, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:112 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s23 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s27 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:96 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s3, s3, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:80 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s2, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s25 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:64 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s1, s1, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:48 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s0, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s23 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32 ; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s21 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16 +; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: constant_zextload_v32i16_to_v32i64: diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -29,9 +29,9 @@ ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: flat_load_ushort v2, v[2:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_load_ushort v2, v[0:1] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) @@ -40,19 +40,19 @@ ; ; GCN-NOHSA-VI-LABEL: global_load_i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s6 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s7 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s2 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, s3 -; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3 +; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_load_i16: @@ -134,9 +134,9 @@ ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: flat_load_dword v2, v[2:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_load_dword v2, v[0:1] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) @@ -145,19 +145,19 @@ ; ; GCN-NOHSA-VI-LABEL: global_load_v2i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s6 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s7 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s2 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, s3 -; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3 +; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_load_v2i16: @@ -236,20 +236,20 @@ ; ; GCN-NOHSA-VI-LABEL: global_load_v3i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 -; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 +; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_load_v3i16: @@ -351,30 +351,30 @@ ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: flat_load_dwordx2 v[2:3], v[2:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN-HSA-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_load_v4i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s6 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s7 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s2 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, s3 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_load_v4i16: @@ -447,19 +447,19 @@ ; ; GCN-NOHSA-VI-LABEL: global_load_v8i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s6 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s7 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s2 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, s3 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_load_v8i16: @@ -525,41 +525,43 @@ ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 -; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GCN-HSA-NEXT: s_waitcnt vmcnt(1) +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] +; GCN-HSA-NEXT: s_waitcnt vmcnt(1) +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_load_v16i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_load_v16i16: @@ -672,12 +674,12 @@ ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 16 -; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 16 +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: s_add_u32 s0, s2, 16 @@ -686,63 +688,71 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 -; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GCN-HSA-NEXT: s_waitcnt vmcnt(1) +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] +; GCN-HSA-NEXT: s_waitcnt vmcnt(1) +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_load_v16i16_align2: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 -; GCN-NOHSA-VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:2 -; GCN-NOHSA-VI-NEXT: buffer_load_ushort v4, off, s[0:3], 0 offset:4 -; GCN-NOHSA-VI-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:6 -; GCN-NOHSA-VI-NEXT: buffer_load_ushort v5, off, s[0:3], 0 offset:8 -; GCN-NOHSA-VI-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:10 -; GCN-NOHSA-VI-NEXT: buffer_load_ushort v6, off, s[0:3], 0 offset:12 -; GCN-NOHSA-VI-NEXT: buffer_load_ushort v7, off, s[0:3], 0 offset:14 -; GCN-NOHSA-VI-NEXT: buffer_load_ushort v8, off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_load_ushort v9, off, s[0:3], 0 offset:18 -; GCN-NOHSA-VI-NEXT: buffer_load_ushort v10, off, s[0:3], 0 offset:20 -; GCN-NOHSA-VI-NEXT: buffer_load_ushort v11, off, s[0:3], 0 offset:22 -; GCN-NOHSA-VI-NEXT: buffer_load_ushort v12, off, s[0:3], 0 offset:24 -; GCN-NOHSA-VI-NEXT: buffer_load_ushort v13, off, s[0:3], 0 offset:26 -; GCN-NOHSA-VI-NEXT: buffer_load_ushort v14, off, s[0:3], 0 offset:28 -; GCN-NOHSA-VI-NEXT: buffer_load_ushort v15, off, s[0:3], 0 offset:30 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s6 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s7 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 +; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 offset:14 +; GCN-NOHSA-VI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:10 +; GCN-NOHSA-VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 offset:6 +; GCN-NOHSA-VI-NEXT: buffer_load_ushort v3, off, s[4:7], 0 offset:2 +; GCN-NOHSA-VI-NEXT: buffer_load_ushort v4, off, s[4:7], 0 offset:30 +; GCN-NOHSA-VI-NEXT: buffer_load_ushort v5, off, s[4:7], 0 offset:26 +; GCN-NOHSA-VI-NEXT: buffer_load_ushort v6, off, s[4:7], 0 offset:22 +; GCN-NOHSA-VI-NEXT: buffer_load_ushort v7, off, s[4:7], 0 offset:18 +; GCN-NOHSA-VI-NEXT: buffer_load_ushort v8, off, s[4:7], 0 offset:12 +; GCN-NOHSA-VI-NEXT: buffer_load_ushort v9, off, s[4:7], 0 offset:8 +; GCN-NOHSA-VI-NEXT: buffer_load_ushort v10, off, s[4:7], 0 offset:4 +; GCN-NOHSA-VI-NEXT: buffer_load_ushort v11, off, s[4:7], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_ushort v12, off, s[4:7], 0 offset:28 +; GCN-NOHSA-VI-NEXT: buffer_load_ushort v13, off, s[4:7], 0 offset:24 +; GCN-NOHSA-VI-NEXT: buffer_load_ushort v14, off, s[4:7], 0 offset:20 +; GCN-NOHSA-VI-NEXT: buffer_load_ushort v15, off, s[4:7], 0 offset:16 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s3 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(14) -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v0, v18, v0 +; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(13) +; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v16, 16, v2 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(12) -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v1, v17, v4 +; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(11) +; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(10) -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v16, 16, v3 -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v2, v16, v5 +; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(9) +; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v18, 16, v6 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(8) -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v3, v7, v6 +; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v19, 16, v7 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(7) +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v3, v0, v8 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(6) -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v4, v9, v8 +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v2, v1, v9 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(5) +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v1, v16, v10 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(4) -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v5, v11, v10 +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v0, v17, v11 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v7, v4, v12 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(2) -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v6, v13, v12 +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v6, v5, v13 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v5, v18, v14 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v7, v15, v14 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v4, v19, v15 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_load_v16i16_align2: @@ -813,9 +823,9 @@ ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: flat_load_ushort v2, v[2:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_load_ushort v2, v[0:1] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) @@ -824,19 +834,19 @@ ; ; GCN-NOHSA-VI-LABEL: global_zextload_i16_to_i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s6 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s7 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s2 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, s3 -; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3 +; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_zextload_i16_to_i32: @@ -898,9 +908,9 @@ ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: flat_load_sshort v2, v[2:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_load_sshort v2, v[0:1] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) @@ -909,19 +919,19 @@ ; ; GCN-NOHSA-VI-LABEL: global_sextload_i16_to_i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s6 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s7 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s2 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, s3 -; GCN-NOHSA-VI-NEXT: buffer_load_sshort v0, off, s[4:7], 0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3 +; GCN-NOHSA-VI-NEXT: buffer_load_sshort v0, off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_sextload_i16_to_i32: @@ -986,9 +996,9 @@ ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: flat_load_ushort v2, v[2:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_load_ushort v2, v[0:1] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) @@ -997,19 +1007,19 @@ ; ; GCN-NOHSA-VI-LABEL: global_zextload_v1i16_to_v1i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s6 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s7 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s2 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, s3 -; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3 +; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_zextload_v1i16_to_v1i32: @@ -1071,9 +1081,9 @@ ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: flat_load_sshort v2, v[2:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_load_sshort v2, v[0:1] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) @@ -1082,19 +1092,19 @@ ; ; GCN-NOHSA-VI-LABEL: global_sextload_v1i16_to_v1i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s6 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s7 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s2 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, s3 -; GCN-NOHSA-VI-NEXT: buffer_load_sshort v0, off, s[4:7], 0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3 +; GCN-NOHSA-VI-NEXT: buffer_load_sshort v0, off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_sextload_v1i16_to_v1i32: @@ -1174,21 +1184,21 @@ ; ; GCN-NOHSA-VI-LABEL: global_zextload_v2i16_to_v2i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3 ; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_zextload_v2i16_to_v2i32: @@ -1273,21 +1283,21 @@ ; ; GCN-NOHSA-VI-LABEL: global_sextload_v2i16_to_v2i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3 ; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v0 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_sextload_v2i16_to_v2i32: @@ -1377,23 +1387,23 @@ ; ; GCN-NOHSA-VI-LABEL: global_zextload_v3i16_to_v3i32: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, 0xffff -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, 0xffff +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, s6, v1 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, s2, v1 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, s6, v0 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, s2, v0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_zextload_v3i16_to_v3i32: @@ -1488,22 +1498,22 @@ ; ; GCN-NOHSA-VI-LABEL: global_sextload_v3i16_to_v3i32: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[3:4], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v3 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v4, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v3, 0, 16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_sextload_v3i16_to_v3i32: @@ -1608,24 +1618,24 @@ ; ; GCN-NOHSA-VI-LABEL: global_zextload_v4i16_to_v4i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, 0xffff -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, 0xffff +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, s6, v1 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, s2, v1 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, s6, v0 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, s2, v0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_zextload_v4i16_to_v4i32: @@ -1733,23 +1743,23 @@ ; ; GCN-NOHSA-VI-LABEL: global_sextload_v4i16_to_v4i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[4:5], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 16, v5 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v4 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v5, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v4, 0, 16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_sextload_v4i16_to_v4i32: @@ -1873,29 +1883,29 @@ ; ; GCN-NOHSA-VI-LABEL: global_zextload_v8i16_to_v8i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, 0xffff -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, 0xffff +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, s6, v3 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, s2, v3 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, s6, v2 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, s2, v2 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, s6, v1 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, s2, v1 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, s6, v0 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, s2, v0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_zextload_v8i16_to_v8i32: @@ -2023,17 +2033,17 @@ ; ; GCN-NOHSA-VI-LABEL: global_sextload_v8i16_to_v8i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 16, v3 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 16, v2 @@ -2043,8 +2053,8 @@ ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 16, v0 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v1, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v0, 0, 16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_sextload_v8i16_to_v8i32: @@ -2163,52 +2173,52 @@ ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_mov_b32 s4, 0xffff ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 +; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_mov_b32 s4, 0xffff -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v9, s4, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v7, s4, v6 -; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[7:10] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v8, s4, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v6, s4, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v12, s4, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v10, s4, v2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s4, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s4, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[10:13] -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[6:9] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v10, s4, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v8, s4, v2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; GCN-HSA-NEXT: s_waitcnt vmcnt(1) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; GCN-HSA-NEXT: v_and_b32_e32 v10, s4, v5 +; GCN-HSA-NEXT: v_and_b32_e32 v8, s4, v4 +; GCN-HSA-NEXT: v_and_b32_e32 v12, s4, v6 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GCN-HSA-NEXT: v_and_b32_e32 v3, s4, v1 +; GCN-HSA-NEXT: v_and_b32_e32 v1, s4, v0 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s1 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v14, s4, v7 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[1:4] +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[5:6], v[8:11] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v16i16_to_v16i32: @@ -2407,8 +2417,8 @@ ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 @@ -2679,19 +2689,19 @@ ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s2, 48 -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s2, 48 +; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 @@ -2705,8 +2715,10 @@ ; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s12, s0, 32 ; GCN-HSA-NEXT: s_addc_u32 s13, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s6 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v1 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v0 @@ -2732,40 +2744,39 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 -; GCN-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v13 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v12 -; GCN-HSA-NEXT: v_and_b32_e32 v6, s14, v13 -; GCN-HSA-NEXT: v_and_b32_e32 v4, s14, v12 +; GCN-HSA-NEXT: s_waitcnt vmcnt(4) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v9 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v8 +; GCN-HSA-NEXT: v_and_b32_e32 v6, s14, v9 +; GCN-HSA-NEXT: v_and_b32_e32 v4, s14, v8 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v15 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v14 -; GCN-HSA-NEXT: v_and_b32_e32 v17, s14, v15 -; GCN-HSA-NEXT: v_and_b32_e32 v15, s14, v14 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v10 +; GCN-HSA-NEXT: v_and_b32_e32 v18, s14, v11 +; GCN-HSA-NEXT: v_and_b32_e32 v16, s14, v10 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[15:18] +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] +; GCN-HSA-NEXT: s_waitcnt vmcnt(5) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v15 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v12 +; GCN-HSA-NEXT: v_and_b32_e32 v9, s14, v15 +; GCN-HSA-NEXT: v_and_b32_e32 v17, s14, v13 +; GCN-HSA-NEXT: v_and_b32_e32 v15, s14, v12 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s1 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v9 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v8 -; GCN-HSA-NEXT: v_and_b32_e32 v17, s14, v9 -; GCN-HSA-NEXT: v_and_b32_e32 v15, s14, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s1 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GCN-HSA-NEXT: v_and_b32_e32 v5, s14, v3 ; GCN-HSA-NEXT: v_and_b32_e32 v3, s14, v2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v11 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v10 -; GCN-HSA-NEXT: v_and_b32_e32 v13, s14, v11 -; GCN-HSA-NEXT: v_and_b32_e32 v11, s14, v10 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v14 +; GCN-HSA-NEXT: v_and_b32_e32 v7, s14, v14 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18] -; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[11:14] -; GCN-HSA-NEXT: flat_store_dwordx4 v[7:8], v[3:6] +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[7:10] +; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[3:6] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v32i16_to_v32i32: @@ -3088,23 +3099,23 @@ ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_add_u32 s4, s2, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: s_add_u32 s4, s2, 48 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 -; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 @@ -3621,194 +3632,195 @@ ; GCN-HSA-LABEL: global_zextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_movk_i32 s6, 0x50 -; GCN-HSA-NEXT: s_movk_i32 s7, 0x60 -; GCN-HSA-NEXT: s_movk_i32 s8, 0x70 -; GCN-HSA-NEXT: s_mov_b32 s9, 0xffff +; GCN-HSA-NEXT: s_movk_i32 s12, 0x50 +; GCN-HSA-NEXT: s_movk_i32 s13, 0x60 +; GCN-HSA-NEXT: s_movk_i32 s14, 0x70 +; GCN-HSA-NEXT: s_mov_b32 s15, 0xffff ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_add_u32 s4, s2, s6 -; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s2, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: s_add_u32 s4, s2, s12 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s2, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GCN-HSA-NEXT: s_add_u32 s4, s2, s13 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[2:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 +; GCN-HSA-NEXT: s_add_u32 s4, s2, s14 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 +; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: s_add_u32 s10, s2, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3 -; GCN-HSA-NEXT: s_addc_u32 s11, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v32, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s2, 64 -; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 +; GCN-HSA-NEXT: s_add_u32 s6, s2, 32 +; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0 ; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GCN-HSA-NEXT: s_add_u32 s8, s2, 48 +; GCN-HSA-NEXT: s_addc_u32 s9, s3, 0 +; GCN-HSA-NEXT: s_add_u32 s2, s2, 64 ; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2 +; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[16:19], v[16:17] +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s8 ; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[20:21] +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s4 ; GCN-HSA-NEXT: flat_load_dwordx4 v[28:31], v[28:29] -; GCN-HSA-NEXT: flat_load_dwordx4 v[32:35], v[32:33] -; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v37, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v36, s0 +; GCN-HSA-NEXT: s_waitcnt vmcnt(6) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; GCN-HSA-NEXT: v_and_b32_e32 v26, s15, v1 +; GCN-HSA-NEXT: v_and_b32_e32 v24, s15, v0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7 +; GCN-HSA-NEXT: flat_load_dwordx4 v[32:35], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s10, s0, 0xf0 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xf0 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xc0 +; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xd0 +; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[24:27] +; GCN-HSA-NEXT: s_add_u32 s10, s0, 0xa0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v26, s15, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v24, s15, v2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27] ; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s12, s0, 0xc0 -; GCN-HSA-NEXT: s_addc_u32 s13, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s14, s0, 0xd0 -; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s16, s0, 0xa0 -; GCN-HSA-NEXT: s_addc_u32 s17, s1, 0 -; GCN-HSA-NEXT: s_waitcnt vmcnt(7) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v5 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v26, s9, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v24, s9, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 -; GCN-HSA-NEXT: s_add_u32 s16, s0, 0xb0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[24:27] -; GCN-HSA-NEXT: s_addc_u32 s17, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v7 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v26, s9, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v24, s9, v6 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[24:27] +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCN-HSA-NEXT: s_waitcnt vmcnt(8) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v6, s9, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v4, s9, v0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v5, s9, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v3, s9, v2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s15 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[3:6] -; GCN-HSA-NEXT: s_waitcnt vmcnt(9) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s15, v5 +; GCN-HSA-NEXT: v_and_b32_e32 v0, s15, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v7 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s15, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v0, s15, v6 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GCN-HSA-NEXT: s_waitcnt vmcnt(9) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v9 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s9, v9 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s9, v8 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s15, v9 +; GCN-HSA-NEXT: v_and_b32_e32 v0, s15, v8 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v11 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s9, v11 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s9, v10 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s5 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s15, v11 +; GCN-HSA-NEXT: v_and_b32_e32 v0, s15, v10 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s4 -; GCN-HSA-NEXT: s_waitcnt vmcnt(6) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v33 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v32 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s9, v33 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s9, v32 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v35 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v34 -; GCN-HSA-NEXT: v_and_b32_e32 v6, s9, v35 -; GCN-HSA-NEXT: v_and_b32_e32 v4, s9, v34 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[4:7] +; GCN-HSA-NEXT: s_waitcnt vmcnt(10) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v14 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v13 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v12 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s15, v13 +; GCN-HSA-NEXT: v_and_b32_e32 v0, s15, v12 +; GCN-HSA-NEXT: v_and_b32_e32 v4, s15, v14 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; GCN-HSA-NEXT: v_and_b32_e32 v6, s15, v15 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; GCN-HSA-NEXT: s_waitcnt vmcnt(11) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v17 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v16 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s15, v17 +; GCN-HSA-NEXT: v_and_b32_e32 v0, s15, v16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v29 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v28 -; GCN-HSA-NEXT: v_and_b32_e32 v10, s9, v29 -; GCN-HSA-NEXT: v_and_b32_e32 v8, s9, v28 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, s7 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v31 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v30 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s9, v31 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s9, v30 +; GCN-HSA-NEXT: s_add_u32 s2, s0, s13 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v19 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v18 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s15, v19 +; GCN-HSA-NEXT: v_and_b32_e32 v0, s15, v18 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, s8 +; GCN-HSA-NEXT: s_add_u32 s2, s0, s14 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v21 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v20 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s9, v21 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s9, v20 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v23 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v22 -; GCN-HSA-NEXT: v_and_b32_e32 v6, s9, v23 -; GCN-HSA-NEXT: v_and_b32_e32 v4, s9, v22 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v15 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v14 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GCN-HSA-NEXT: v_and_b32_e32 v2, s9, v15 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v13 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v12 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s9, v14 -; GCN-HSA-NEXT: v_and_b32_e32 v6, s9, v13 -; GCN-HSA-NEXT: v_and_b32_e32 v4, s9, v12 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v17 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v16 -; GCN-HSA-NEXT: v_and_b32_e32 v14, s9, v17 -; GCN-HSA-NEXT: v_and_b32_e32 v12, s9, v16 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, s6 +; GCN-HSA-NEXT: s_add_u32 s2, s0, s12 +; GCN-HSA-NEXT: s_waitcnt vmcnt(10) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v33 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v32 +; GCN-HSA-NEXT: v_and_b32_e32 v14, s15, v33 +; GCN-HSA-NEXT: v_and_b32_e32 v12, s15, v32 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v21 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v20 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s15, v21 +; GCN-HSA-NEXT: v_and_b32_e32 v0, s15, v20 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v19 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v23 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v22 +; GCN-HSA-NEXT: v_and_b32_e32 v6, s15, v23 +; GCN-HSA-NEXT: v_and_b32_e32 v4, s15, v22 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v18 -; GCN-HSA-NEXT: v_and_b32_e32 v10, s9, v19 -; GCN-HSA-NEXT: v_and_b32_e32 v8, s9, v18 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v35 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v34 +; GCN-HSA-NEXT: v_and_b32_e32 v10, s15, v35 +; GCN-HSA-NEXT: v_and_b32_e32 v8, s15, v34 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v29 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v28 +; GCN-HSA-NEXT: v_and_b32_e32 v6, s15, v29 +; GCN-HSA-NEXT: v_and_b32_e32 v4, s15, v28 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v31 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v30 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s15, v31 +; GCN-HSA-NEXT: v_and_b32_e32 v0, s15, v30 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm @@ -3825,33 +3837,33 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s7 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s2 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, s3 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:16 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:32 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[32:35], off, s[4:7], 0 offset:48 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, 0xffff ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(7) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v63, 16, v7 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v62, s4, v7 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v61, 16, v6 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v60, s4, v6 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v63, 16, v11 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v62, s4, v11 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v61, 16, v10 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v60, s4, v10 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, s4, v9 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, s4, v8 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v7 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, s4, v7 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, s4, v6 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, s4, v5 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, s4, v4 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v23 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, s4, v23 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v22 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, s4, v22 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v22, s4, v21 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, s4, v20 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(2) ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v39, 16, v27 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v38, s4, v27 @@ -3879,37 +3891,37 @@ ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v34, s4, v33 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, s4, v32 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v50, s4, v19 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v48, s4, v18 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v51, 16, v23 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v50, s4, v23 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v49, 16, v22 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v48, s4, v22 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v22, s4, v21 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, s4, v20 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v55, 16, v19 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v54, s4, v19 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v53, 16, v18 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v52, s4, v18 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, s4, v17 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, s4, v16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v54, s4, v15 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v52, s4, v14 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v59, 16, v15 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v58, s4, v15 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v57, 16, v14 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v56, s4, v14 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, s4, v13 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, s4, v12 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v59, 16, v11 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v58, s4, v11 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v57, 16, v10 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v56, s4, v10 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, s4, v9 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, s4, v8 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:224 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:192 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:160 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:160 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:128 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:128 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:144 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:96 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:112 @@ -3917,7 +3929,7 @@ ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:80 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_endpgm ; @@ -4403,70 +4415,66 @@ ; GCN-HSA-LABEL: global_sextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_movk_i32 s9, 0x70 -; GCN-HSA-NEXT: s_movk_i32 s10, 0x60 -; GCN-HSA-NEXT: s_movk_i32 s8, 0x50 +; GCN-HSA-NEXT: s_movk_i32 s7, 0x70 +; GCN-HSA-NEXT: s_movk_i32 s8, 0x60 +; GCN-HSA-NEXT: s_movk_i32 s6, 0x50 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_add_u32 s4, s2, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] +; GCN-HSA-NEXT: s_add_u32 s4, s2, s7 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s2, s10 -; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 -; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] -; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s2, s6 +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 64 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: s_add_u32 s6, s2, 48 -; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s6 ; GCN-HSA-NEXT: flat_load_dwordx4 v[16:19], v[16:17] -; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[20:21] -; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 +; GCN-HSA-NEXT: s_add_u32 s4, s2, 48 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 +; GCN-HSA-NEXT: s_add_u32 s10, s2, 32 +; GCN-HSA-NEXT: s_addc_u32 s11, s3, 0 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s5 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v32, s2 -; GCN-HSA-NEXT: flat_load_dwordx4 v[28:31], v[28:29] -; GCN-HSA-NEXT: flat_load_dwordx4 v[32:35], v[32:33] -; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_waitcnt vmcnt(7) +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s11 +; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[20:21] +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v37, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v36, s0 +; GCN-HSA-NEXT: s_waitcnt vmcnt(5) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 16, v13 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 16, v12 ; GCN-HSA-NEXT: v_bfe_i32 v26, v13, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v24, v12, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 +; GCN-HSA-NEXT: flat_load_dwordx4 v[28:31], v[12:13] +; GCN-HSA-NEXT: flat_load_dwordx4 v[32:35], v[32:33] +; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[24:27] ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[24:27] +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 16, v15 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 16, v14 ; GCN-HSA-NEXT: v_bfe_i32 v26, v15, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v24, v14, 0, 16 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[24:27] ; GCN-HSA-NEXT: s_waitcnt vmcnt(8) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v9 @@ -4475,51 +4483,56 @@ ; GCN-HSA-NEXT: v_bfe_i32 v12, v8, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[12:15] ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 16, v11 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 16, v10 ; GCN-HSA-NEXT: v_bfe_i32 v13, v11, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v11, v10, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[11:14] -; GCN-HSA-NEXT: s_waitcnt vmcnt(8) +; GCN-HSA-NEXT: s_waitcnt vmcnt(9) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v5 ; GCN-HSA-NEXT: v_bfe_i32 v10, v5, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v8, v4, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v7 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v6 -; GCN-HSA-NEXT: v_bfe_i32 v14, v7, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v12, v6, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[8:11] -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v7 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 16, v6 +; GCN-HSA-NEXT: v_bfe_i32 v9, v7, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v7, v6, 0, 16 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[7:10] +; GCN-HSA-NEXT: s_waitcnt vmcnt(10) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v1 ; GCN-HSA-NEXT: v_bfe_i32 v6, v1, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v4, v0, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v2 -; GCN-HSA-NEXT: v_bfe_i32 v10, v3, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v8, v2, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[4:7] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v3 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v2 +; GCN-HSA-NEXT: v_bfe_i32 v10, v3, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v8, v2, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] @@ -4532,7 +4545,7 @@ ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, s10 +; GCN-HSA-NEXT: s_add_u32 s2, s0, s8 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v19 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v18 ; GCN-HSA-NEXT: v_bfe_i32 v2, v19, 0, 16 @@ -4541,7 +4554,7 @@ ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, s9 +; GCN-HSA-NEXT: s_add_u32 s2, s0, s7 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 @@ -4554,22 +4567,22 @@ ; GCN-HSA-NEXT: v_bfe_i32 v10, v23, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v8, v22, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, s8 +; GCN-HSA-NEXT: s_add_u32 s2, s0, s6 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] -; GCN-HSA-NEXT: s_waitcnt vmcnt(12) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v29 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v28 -; GCN-HSA-NEXT: v_bfe_i32 v14, v29, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v12, v28, 0, 16 +; GCN-HSA-NEXT: s_waitcnt vmcnt(11) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v33 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v32 +; GCN-HSA-NEXT: v_bfe_i32 v14, v33, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v12, v32, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v31 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v35 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v30 -; GCN-HSA-NEXT: v_bfe_i32 v10, v31, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v8, v30, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v34 +; GCN-HSA-NEXT: v_bfe_i32 v10, v35, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v8, v34, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v21 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v20 @@ -4579,19 +4592,18 @@ ; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_waitcnt vmcnt(14) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v33 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v32 -; GCN-HSA-NEXT: v_bfe_i32 v6, v33, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v4, v32, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v29 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v28 +; GCN-HSA-NEXT: v_bfe_i32 v6, v29, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v4, v28, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v35 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v31 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v34 -; GCN-HSA-NEXT: v_bfe_i32 v2, v35, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v0, v34, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v30 +; GCN-HSA-NEXT: v_bfe_i32 v2, v31, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v0, v30, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm @@ -4613,43 +4625,43 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s7 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s2 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, s3 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:16 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[32:35], off, s[4:7], 0 offset:32 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[36:39], off, s[4:7], 0 offset:48 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s89, s89, 0 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(6) -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v63, 16, v17 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v63, 16, v13 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(5) -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v59, 16, v21 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v59, 16, v17 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(4) -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v55, 16, v25 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v55, 16, v21 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 16, v11 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v10 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v11, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v10, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 16, v27 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v26 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v27, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v26, 0, 16 ; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-VI-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill ; GCN-NOHSA-VI-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill ; GCN-NOHSA-VI-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 16, v13 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v12 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v13, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v12, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 16, v9 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 16, v8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v9, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v8, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 16, v31 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 16, v30 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v31, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v30, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 16, v9 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v9, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v8, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 16, v25 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 16, v24 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v25, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v24, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v27, 16, v31 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v25, 16, v30 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v26, v31, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v24, v30, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v43, 16, v29 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v41, 16, v28 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v42, v29, 0, 16 @@ -4670,37 +4682,37 @@ ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v49, 16, v36 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v50, v37, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v48, v36, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v39, 16, v27 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v37, 16, v26 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v38, v27, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v36, v26, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v53, 16, v24 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v54, v25, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v52, v24, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v26, 16, v23 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v24, 16, v22 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v25, v23, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v23, v22, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v57, 16, v20 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v58, v21, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v56, v20, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v39, 16, v23 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v37, 16, v22 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v38, v23, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v36, v22, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v53, 16, v20 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v54, v21, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v52, v20, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v22, 16, v19 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 16, v18 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v19, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v18, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v61, 16, v16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v62, v17, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v60, v16, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v57, 16, v16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v58, v17, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v56, v16, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 16, v15 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 16, v14 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v15, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v14, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v61, 16, v12 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v62, v13, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v60, v12, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 16, v11 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 16, v10 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v11, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v10, 0, 16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:240 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:240 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:208 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:208 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:176 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:176 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:128 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:144 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:96 @@ -4708,7 +4720,7 @@ ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:64 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:80 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:48 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload ; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload @@ -5164,33 +5176,33 @@ ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: flat_load_ushort v2, v[2:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_load_ushort v0, v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN-HSA-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_i16_to_i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3 ; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_zextload_i16_to_i64: @@ -5261,33 +5273,33 @@ ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: flat_load_sshort v2, v[2:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_load_sshort v0, v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-HSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-HSA-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_i16_to_i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s6 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s7 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s2 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, s3 -; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3 +; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_sextload_i16_to_i64: @@ -5356,33 +5368,33 @@ ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: flat_load_ushort v2, v[2:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_load_ushort v0, v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN-HSA-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v1i16_to_v1i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3 ; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_zextload_v1i16_to_v1i64: @@ -5448,33 +5460,33 @@ ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: flat_load_sshort v2, v[2:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_load_sshort v0, v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-HSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-HSA-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v1i16_to_v1i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s6 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s7 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s2 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, s3 -; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3 +; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_sextload_v1i16_to_v1i64: @@ -5561,23 +5573,23 @@ ; ; GCN-NOHSA-VI-LABEL: global_zextload_v2i16_to_v2i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3 ; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[8:11], 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_zextload_v2i16_to_v2i64: @@ -5671,24 +5683,24 @@ ; ; GCN-NOHSA-VI-LABEL: global_sextload_v2i16_to_v2i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3 ; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v1, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_sextload_v2i16_to_v2i64: @@ -5799,29 +5811,29 @@ ; ; GCN-NOHSA-VI-LABEL: global_zextload_v4i16_to_v4i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[8:9], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, 0xffff +; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, 0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, s6, v9 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, s2, v9 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v9 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, s6, v8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, s2, v8 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_zextload_v4i16_to_v4i64: @@ -5951,20 +5963,20 @@ ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCN-HSA-NEXT: s_endpgm -; -; GCN-NOHSA-VI-LABEL: global_sextload_v4i16_to_v4i64: -; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3 +; +; GCN-NOHSA-VI-LABEL: global_sextload_v4i16_to_v4i64: +; GCN-NOHSA-VI: ; %bb.0: +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[1:2], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, v2 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 @@ -5977,8 +5989,8 @@ ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_sextload_v4i16_to_v4i64: @@ -6576,13 +6588,13 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v10, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, v8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 +; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 @@ -6594,56 +6606,57 @@ ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s2 -; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v9, s6, v5 +; GCN-HSA-NEXT: s_waitcnt vmcnt(1) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; GCN-HSA-NEXT: v_and_b32_e32 v9, s6, v1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[9:12] ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v9, s6, v1 +; GCN-HSA-NEXT: s_waitcnt vmcnt(1) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; GCN-HSA-NEXT: v_and_b32_e32 v9, s6, v5 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[9:12] ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s5 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s4 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v9, s6, v3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v9, s6, v7 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 ; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[9:12] ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v7, s6, v7 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v11, s6, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v7, s6, v3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v11, s6, v6 ; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[7:10] ; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[11:14] -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, v8 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v11, s6, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v6, s6, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x60 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v14 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, v8 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v11, s6, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v6, s6, v0 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s6, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GCN-HSA-NEXT: v_and_b32_e32 v0, s6, v0 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, v14 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[6:9] -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[11:14] +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[6:9] +; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[11:14] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v16i16_to_v16i64: @@ -6950,12 +6963,12 @@ ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[2:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 @@ -6973,63 +6986,63 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) -; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[4:5], 48 -; GCN-HSA-NEXT: v_bfe_i32 v8, v5, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[0:1], 48 +; GCN-HSA-NEXT: v_bfe_i32 v8, v1, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, v3 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, v7 +; GCN-HSA-NEXT: v_bfe_i32 v8, v1, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[2:3], 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: v_bfe_i32 v8, v5, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[6:7], 48 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] -; GCN-HSA-NEXT: v_bfe_i32 v7, v5, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s3 -; GCN-HSA-NEXT: v_bfe_i32 v5, v6, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7 -; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[5:8] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GCN-HSA-NEXT: v_bfe_i32 v8, v2, 0, 16 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 +; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-HSA-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v6, v6, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GCN-HSA-NEXT: v_bfe_i32 v10, v1, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7] -; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] ; GCN-HSA-NEXT: s_waitcnt vmcnt(4) -; GCN-HSA-NEXT: v_ashr_i64 v[6:7], v[0:1], 48 -; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, v3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GCN-HSA-NEXT: v_ashr_i64 v[2:3], v[4:5], 48 +; GCN-HSA-NEXT: v_bfe_i32 v0, v5, 0, 16 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, v7 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[4:7] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GCN-HSA-NEXT: v_bfe_i32 v4, v0, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] ; GCN-HSA-NEXT: v_bfe_i32 v10, v8, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v8, v2, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[2:3], v[2:3], 48 -; GCN-HSA-NEXT: v_bfe_i32 v0, v11, 0, 16 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GCN-HSA-NEXT: v_bfe_i32 v0, v4, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v8, v6, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[6:7], v[6:7], 48 +; GCN-HSA-NEXT: v_bfe_i32 v4, v11, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1 -; GCN-HSA-NEXT: v_bfe_i32 v6, v1, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-HSA-NEXT: v_bfe_i32 v2, v1, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[4:7] ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v16i16_to_v16i64: @@ -7430,149 +7443,150 @@ ; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s16, 0xffff -; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, v1 +; GCN-HSA-NEXT: s_mov_b32 s18, 0xffff +; GCN-HSA-NEXT: v_mov_b32_e32 v4, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, v4 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4 -; GCN-HSA-NEXT: flat_load_dwordx4 v[6:9], v[6:7] -; GCN-HSA-NEXT: flat_load_dwordx4 v[10:13], v[10:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s4 +; GCN-HSA-NEXT: flat_load_dwordx4 v[9:12], v[9:10] +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 -; GCN-HSA-NEXT: flat_load_dwordx4 v[14:17], v[14:15] -; GCN-HSA-NEXT: flat_load_dwordx4 v[18:21], v[18:19] -; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 +; GCN-HSA-NEXT: flat_load_dwordx4 v[13:16], v[13:14] +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2 +; GCN-HSA-NEXT: flat_load_dwordx4 v[17:20], v[17:18] +; GCN-HSA-NEXT: s_add_u32 s4, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xf0 +; GCN-HSA-NEXT: s_add_u32 s6, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xd0 +; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xf0 ; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s10, s0, 0xb0 +; GCN-HSA-NEXT: s_add_u32 s10, s0, 0xd0 ; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s12, s0, 0x90 +; GCN-HSA-NEXT: s_add_u32 s12, s0, 0xb0 ; GCN-HSA-NEXT: s_addc_u32 s13, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s14, s0, 0x70 -; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s14 -; GCN-HSA-NEXT: s_add_u32 s14, s0, 0x50 +; GCN-HSA-NEXT: s_add_u32 s14, s0, 0x90 ; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0 +; GCN-HSA-NEXT: s_add_u32 s16, s0, 0x70 +; GCN-HSA-NEXT: s_addc_u32 s17, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s17 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s16 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v9 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v9 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[2:5] -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s15 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v7 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s14 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[2:5] -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s11 -; GCN-HSA-NEXT: s_waitcnt vmcnt(4) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v13 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v13 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s10 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[2:5] -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s13 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v11 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v11 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s12 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[2:5] -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s5 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v5, s18, v3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[5:8] +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GCN-HSA-NEXT: v_and_b32_e32 v5, s18, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[5:8] +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s13 ; GCN-HSA-NEXT: s_waitcnt vmcnt(4) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v19 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v19 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s4 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[2:5] -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s7 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v17 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v17 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s6 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[2:5] -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s9 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 32 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v15 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v15 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s8 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[2:5] -; GCN-HSA-NEXT: v_mov_b32_e32 v7, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v18 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v18 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s1 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v12 +; GCN-HSA-NEXT: v_and_b32_e32 v5, s18, v12 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s12 +; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[5:8] +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s15 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v10 +; GCN-HSA-NEXT: v_and_b32_e32 v5, s18, v10 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s14 +; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[5:8] +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s7 +; GCN-HSA-NEXT: s_waitcnt vmcnt(5) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v14 +; GCN-HSA-NEXT: v_and_b32_e32 v5, s18, v14 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s6 +; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[5:8] +; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: s_waitcnt vmcnt(5) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v20 +; GCN-HSA-NEXT: v_and_b32_e32 v5, s18, v20 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[5:8] +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s11 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v18 +; GCN-HSA-NEXT: v_and_b32_e32 v5, s18, v18 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s10 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[5:8] +; GCN-HSA-NEXT: v_mov_b32_e32 v10, 0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v13 +; GCN-HSA-NEXT: v_and_b32_e32 v5, s18, v13 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xe0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, v7 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, v10 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[2:5] -; GCN-HSA-NEXT: v_and_b32_e32 v0, s16, v21 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v16 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v16 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s6 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[5:8] +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6 ; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xc0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v19 +; GCN-HSA-NEXT: v_and_b32_e32 v5, s18, v19 ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[2:5] -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v14 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v14 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s6 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[5:8] +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6 ; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xa0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v17 +; GCN-HSA-NEXT: v_and_b32_e32 v5, s18, v17 ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[2:5] -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v12 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v12 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[5:8] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v11 +; GCN-HSA-NEXT: v_and_b32_e32 v5, s18, v11 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s6 ; GCN-HSA-NEXT: s_add_u32 s6, s0, 0x80 ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[2:5] +; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[5:8] +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s7 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v9 +; GCN-HSA-NEXT: v_and_b32_e32 v5, s18, v9 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s6 +; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[5:8] +; GCN-HSA-NEXT: v_and_b32_e32 v11, s18, v0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v16 +; GCN-HSA-NEXT: v_and_b32_e32 v3, s18, v16 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[3:6] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v15 +; GCN-HSA-NEXT: v_and_b32_e32 v7, s18, v15 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v10 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v10 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s6 -; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[2:5] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v6 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v21 -; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[0:3] -; GCN-HSA-NEXT: v_and_b32_e32 v9, s16, v6 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v20 -; GCN-HSA-NEXT: v_and_b32_e32 v4, s16, v20 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v17, s18, v2 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, v10 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v8 -; GCN-HSA-NEXT: v_and_b32_e32 v12, s16, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, v7 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, v1 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[17:20] ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v12, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, v10 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, v7 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[9:12] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v32i16_to_v32i64: @@ -8129,23 +8143,23 @@ ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_add_u32 s4, s2, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[2:3] +; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] +; GCN-HSA-NEXT: s_add_u32 s4, s2, 48 +; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 -; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 @@ -8154,8 +8168,8 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s4 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-HSA-NEXT: v_ashr_i64 v[18:19], v[4:5], 48 -; GCN-HSA-NEXT: v_bfe_i32 v16, v5, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[18:19], v[8:9], 48 +; GCN-HSA-NEXT: v_bfe_i32 v16, v9, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 @@ -8170,130 +8184,130 @@ ; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s10, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, v7 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, v11 ; GCN-HSA-NEXT: s_add_u32 s12, s0, 0x50 -; GCN-HSA-NEXT: v_bfe_i32 v16, v5, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[18:19], v[6:7], 48 +; GCN-HSA-NEXT: v_bfe_i32 v16, v9, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[18:19], v[10:11], 48 ; GCN-HSA-NEXT: s_addc_u32 s13, s1, 0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-HSA-NEXT: s_add_u32 s14, s0, 32 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v10 ; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] ; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0 -; GCN-HSA-NEXT: v_bfe_i32 v18, v5, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v16, v6, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s14 +; GCN-HSA-NEXT: v_bfe_i32 v18, v9, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v16, v10, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s14 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s15 -; GCN-HSA-NEXT: flat_store_dwordx4 v[5:6], v[16:19] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s15 +; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[16:19] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 -; GCN-HSA-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v6, v5, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v8, v8, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v10, v9, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[4:7] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s5 ; GCN-HSA-NEXT: s_waitcnt vmcnt(6) -; GCN-HSA-NEXT: v_ashr_i64 v[6:7], v[0:1], 48 -; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[0:1], 48 +; GCN-HSA-NEXT: v_bfe_i32 v8, v1, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s4 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, v3 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 -; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[6:7], v[2:3], 48 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[4:7] +; GCN-HSA-NEXT: v_bfe_i32 v8, v1, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[2:3], 48 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11] ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s9 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7) -; GCN-HSA-NEXT: v_ashr_i64 v[5:6], v[8:9], 48 -; GCN-HSA-NEXT: v_bfe_i32 v3, v9, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[4:5], 48 +; GCN-HSA-NEXT: v_bfe_i32 v8, v5, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s8 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, v11 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, v7 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s7 -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[3:6] +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11] ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s6 -; GCN-HSA-NEXT: v_bfe_i32 v3, v1, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[5:6], v[10:11], 48 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[3:6] +; GCN-HSA-NEXT: v_bfe_i32 v8, v1, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[6:7], 48 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s13 ; GCN-HSA-NEXT: s_waitcnt vmcnt(8) -; GCN-HSA-NEXT: v_ashr_i64 v[5:6], v[12:13], 48 -; GCN-HSA-NEXT: v_bfe_i32 v3, v13, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[9:10], v[12:13], 48 +; GCN-HSA-NEXT: v_bfe_i32 v7, v13, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s12 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, v15 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, v15 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s11 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[3:6] -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 -; GCN-HSA-NEXT: v_bfe_i32 v3, v1, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[5:6], v[14:15], 48 +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[7:10] ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s10 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GCN-HSA-NEXT: v_bfe_i32 v7, v3, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[9:10], v[14:15], 48 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[7:10] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[3:6] -; GCN-HSA-NEXT: v_bfe_i32 v1, v2, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v3, v7, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s2 +; GCN-HSA-NEXT: v_bfe_i32 v7, v2, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v9, v1, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v9 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[1:4] -; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v2, v5, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[1:2], v[7:10] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v4 +; GCN-HSA-NEXT: v_bfe_i32 v11, v4, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v17, v5, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_bfe_i32 v15, v6, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v17 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v10 +; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v2, v3, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[15:18] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v14 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_bfe_i32 v4, v14, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v13, v13, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v14 +; GCN-HSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3] +; GCN-HSA-NEXT: v_bfe_i32 v7, v14, 0, 16 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v12 ; GCN-HSA-NEXT: v_bfe_i32 v0, v12, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v12, v10, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v14, v15, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; GCN-HSA-NEXT: v_bfe_i32 v8, v8, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 -; GCN-HSA-NEXT: v_bfe_i32 v10, v11, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_bfe_i32 v6, v6, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; GCN-HSA-NEXT: v_bfe_i32 v9, v9, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[11:14] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v9 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[7:10] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -8304,18 +8318,18 @@ ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s6 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s7 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s2 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, s3 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v16, 16, v14 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v16, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v14, 0, 16 @@ -8329,6 +8343,7 @@ ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:240 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(2) ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v1, 0, 16 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v14, 0, 16 diff --git a/llvm/test/CodeGen/AMDGPU/load-hi16.ll b/llvm/test/CodeGen/AMDGPU/load-hi16.ll --- a/llvm/test/CodeGen/AMDGPU/load-hi16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-hi16.ll @@ -28,13 +28,13 @@ ; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_multi_use_hi: ; GFX900: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-DAG: ds_read_u16 [[LO:v[0-9]+]], v0 ; GFX900-DAG: ds_read_u16 [[HI:v[0-9]+]], v0 offset:16 +; GFX900-DAG: ds_read_u16 [[LO:v[0-9]+]], v0 ; GFX900-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 ; GFX900-DAG: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, [[LO]] -; GFX900-DAG: s_waitcnt lgkmcnt(0) +; GFX900-DAG: s_waitcnt lgkmcnt(1) ; GFX900-DAG: ds_write_b16 [[ZERO]], [[HI]] -; GFX900: v_lshl_or_b32 [[HI]], [[HI]], 16, [[AND]] +; GFX900: v_lshl_or_b32 v{{[0-9]+}}, [[HI]], 16, [[AND]] ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] define <2 x i16> @load_local_lo_hi_v2i16_multi_use_hi(i16 addrspace(3)* noalias %in) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/load-lo16.ll b/llvm/test/CodeGen/AMDGPU/load-lo16.ll --- a/llvm/test/CodeGen/AMDGPU/load-lo16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-lo16.ll @@ -1036,10 +1036,10 @@ ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: flat_load_ubyte v0, v[0:1] -; GFX803-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1118,10 +1118,10 @@ ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: flat_load_ubyte v0, v[0:1] -; GFX803-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1602,11 +1602,11 @@ ; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8: ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX803-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:4094 glc +; GFX803-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc ; GFX803-NEXT: s_waitcnt vmcnt(0) +; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 -; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1704,11 +1704,11 @@ ; GFX803-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8: ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX803-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:4094 glc +; GFX803-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc ; GFX803-NEXT: s_waitcnt vmcnt(0) +; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 -; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/load-local.128.ll b/llvm/test/CodeGen/AMDGPU/load-local.128.ll --- a/llvm/test/CodeGen/AMDGPU/load-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local.128.ll @@ -23,10 +23,11 @@ ; GFX6-LABEL: load_lds_v4i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b64 v[2:3], v1 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 8, v2 ; GFX6-NEXT: ds_read_b64 v[0:1], v0 +; GFX6-NEXT: ds_read_b64 v[2:3], v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -87,54 +88,51 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_u8 v1, v0 offset:7 -; GFX7-NEXT: ds_read_u8 v2, v0 offset:6 -; GFX7-NEXT: ds_read_u8 v3, v0 offset:5 -; GFX7-NEXT: ds_read_u8 v5, v0 offset:4 -; GFX7-NEXT: ds_read_u8 v4, v0 offset:3 -; GFX7-NEXT: ds_read_u8 v6, v0 offset:2 -; GFX7-NEXT: ds_read_u8 v7, v0 offset:1 -; GFX7-NEXT: ds_read_u8 v8, v0 -; GFX7-NEXT: s_waitcnt lgkmcnt(7) -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(3) +; GFX7-NEXT: ds_read_u8 v1, v0 offset:6 +; GFX7-NEXT: ds_read_u8 v2, v0 offset:4 +; GFX7-NEXT: ds_read_u8 v3, v0 offset:2 +; GFX7-NEXT: ds_read_u8 v4, v0 offset:1 +; GFX7-NEXT: ds_read_u8 v5, v0 +; GFX7-NEXT: ds_read_u8 v6, v0 offset:3 +; GFX7-NEXT: ds_read_u8 v7, v0 offset:5 +; GFX7-NEXT: ds_read_u8 v8, v0 offset:7 +; GFX7-NEXT: s_waitcnt lgkmcnt(4) ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX7-NEXT: s_waitcnt lgkmcnt(3) +; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: s_waitcnt lgkmcnt(2) -; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(1) -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v7 +; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v3, v3, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v7 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX7-NEXT: ds_read_u8 v3, v0 offset:15 -; GFX7-NEXT: ds_read_u8 v5, v0 offset:14 -; GFX7-NEXT: ds_read_u8 v6, v0 offset:13 -; GFX7-NEXT: ds_read_u8 v7, v0 offset:12 -; GFX7-NEXT: ds_read_u8 v2, v0 offset:11 -; GFX7-NEXT: ds_read_u8 v8, v0 offset:10 -; GFX7-NEXT: ds_read_u8 v9, v0 offset:9 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v8 +; GFX7-NEXT: ds_read_u8 v5, v0 offset:15 +; GFX7-NEXT: ds_read_u8 v6, v0 offset:14 +; GFX7-NEXT: ds_read_u8 v7, v0 offset:13 +; GFX7-NEXT: ds_read_u8 v8, v0 offset:12 +; GFX7-NEXT: ds_read_u8 v9, v0 offset:11 +; GFX7-NEXT: ds_read_u8 v10, v0 offset:10 +; GFX7-NEXT: ds_read_u8 v11, v0 offset:9 ; GFX7-NEXT: ds_read_u8 v0, v0 offset:8 -; GFX7-NEXT: s_waitcnt lgkmcnt(7) -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX7-NEXT: s_waitcnt lgkmcnt(3) -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX7-NEXT: s_waitcnt lgkmcnt(2) -; GFX7-NEXT: v_or_b32_e32 v2, v2, v8 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(1) -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v11 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_or_b32_e32 v0, v9, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v9 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v10 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v5 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v6 -; GFX7-NEXT: v_or_b32_e32 v3, v3, v5 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v7 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v6 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, v4 @@ -151,30 +149,36 @@ ; GFX6-NEXT: v_add_i32_e32 v6, vcc, 8, v0 ; GFX6-NEXT: v_add_i32_e32 v7, vcc, 11, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: ds_read_u8 v1, v1 ; GFX6-NEXT: ds_read_u8 v2, v2 ; GFX6-NEXT: ds_read_u8 v3, v3 ; GFX6-NEXT: ds_read_u8 v4, v4 ; GFX6-NEXT: ds_read_u8 v5, v5 ; GFX6-NEXT: ds_read_u8 v6, v6 ; GFX6-NEXT: ds_read_u8 v7, v7 -; GFX6-NEXT: ds_read_u8 v1, v1 ; GFX6-NEXT: ds_read_u8 v8, v0 -; GFX6-NEXT: v_add_i32_e32 v9, vcc, 14, v0 -; GFX6-NEXT: v_add_i32_e32 v10, vcc, 3, v0 -; GFX6-NEXT: s_waitcnt lgkmcnt(1) +; GFX6-NEXT: s_waitcnt lgkmcnt(7) ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(6) ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: s_waitcnt lgkmcnt(5) ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 8, v3 +; GFX6-NEXT: s_waitcnt lgkmcnt(4) ; GFX6-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(3) ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 8, v5 +; GFX6-NEXT: s_waitcnt lgkmcnt(2) ; GFX6-NEXT: v_or_b32_e32 v2, v2, v6 +; GFX6-NEXT: s_waitcnt lgkmcnt(1) ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 8, v7 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 10, v0 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, 13, v0 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, 12, v0 ; GFX6-NEXT: v_add_i32_e32 v7, vcc, 15, v0 +; GFX6-NEXT: v_add_i32_e32 v9, vcc, 14, v0 +; GFX6-NEXT: v_add_i32_e32 v10, vcc, 3, v0 ; GFX6-NEXT: v_add_i32_e32 v11, vcc, 2, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 1, v0 ; GFX6-NEXT: ds_read_u8 v4, v4 @@ -279,26 +283,27 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_u16 v3, v0 offset:14 -; GFX7-NEXT: ds_read_u16 v4, v0 offset:12 -; GFX7-NEXT: ds_read_u16 v2, v0 offset:10 -; GFX7-NEXT: ds_read_u16 v5, v0 offset:8 -; GFX7-NEXT: ds_read_u16 v1, v0 offset:6 -; GFX7-NEXT: ds_read_u16 v6, v0 offset:4 -; GFX7-NEXT: ds_read_u16 v7, v0 offset:2 -; GFX7-NEXT: ds_read_u16 v0, v0 -; GFX7-NEXT: s_waitcnt lgkmcnt(5) -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: ds_read_u16 v3, v0 offset:12 +; GFX7-NEXT: ds_read_u16 v2, v0 offset:8 +; GFX7-NEXT: ds_read_u16 v1, v0 offset:4 +; GFX7-NEXT: ds_read_u16 v4, v0 offset:2 +; GFX7-NEXT: ds_read_u16 v5, v0 +; GFX7-NEXT: ds_read_u16 v6, v0 offset:6 +; GFX7-NEXT: ds_read_u16 v7, v0 offset:10 +; GFX7-NEXT: ds_read_u16 v8, v0 offset:14 +; GFX7-NEXT: s_waitcnt lgkmcnt(4) +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(3) -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: s_waitcnt lgkmcnt(2) +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(1) -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_or_b32_e32 v0, v7, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v6 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 -; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: load_lds_v4i32_align2: @@ -312,20 +317,25 @@ ; GFX6-NEXT: v_add_i32_e32 v6, vcc, 12, v0 ; GFX6-NEXT: v_add_i32_e32 v7, vcc, 2, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: ds_read_u16 v1, v1 ; GFX6-NEXT: ds_read_u16 v2, v2 ; GFX6-NEXT: ds_read_u16 v3, v3 ; GFX6-NEXT: ds_read_u16 v4, v4 ; GFX6-NEXT: ds_read_u16 v5, v5 ; GFX6-NEXT: ds_read_u16 v6, v6 ; GFX6-NEXT: ds_read_u16 v7, v7 -; GFX6-NEXT: ds_read_u16 v1, v1 ; GFX6-NEXT: ds_read_u16 v0, v0 -; GFX6-NEXT: s_waitcnt lgkmcnt(1) +; GFX6-NEXT: s_waitcnt lgkmcnt(7) ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(6) ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: s_waitcnt lgkmcnt(5) ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX6-NEXT: s_waitcnt lgkmcnt(4) ; GFX6-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX6-NEXT: s_waitcnt lgkmcnt(3) ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: s_waitcnt lgkmcnt(1) ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v7 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v6 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -386,8 +396,8 @@ ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b32 v2, v2 ; GFX6-NEXT: ds_read_b32 v3, v3 -; GFX6-NEXT: ds_read_b32 v1, v1 ; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: ds_read_b32 v1, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -423,10 +433,11 @@ ; GFX6-LABEL: load_lds_v4i32_align8: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b64 v[2:3], v1 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 8, v2 ; GFX6-NEXT: ds_read_b64 v[0:1], v0 +; GFX6-NEXT: ds_read_b64 v[2:3], v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -460,10 +471,11 @@ ; GFX6-LABEL: load_lds_v4i32_align16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b64 v[2:3], v1 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 8, v2 ; GFX6-NEXT: ds_read_b64 v[0:1], v0 +; GFX6-NEXT: ds_read_b64 v[2:3], v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/load-local.96.ll b/llvm/test/CodeGen/AMDGPU/load-local.96.ll --- a/llvm/test/CodeGen/AMDGPU/load-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local.96.ll @@ -23,10 +23,11 @@ ; GFX6-LABEL: load_lds_v3i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v1 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 8, v2 ; GFX6-NEXT: ds_read_b64 v[0:1], v0 +; GFX6-NEXT: ds_read_b32 v2, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -78,42 +79,41 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_u8 v1, v0 offset:7 -; GFX7-NEXT: ds_read_u8 v2, v0 offset:6 -; GFX7-NEXT: ds_read_u8 v4, v0 offset:5 -; GFX7-NEXT: ds_read_u8 v5, v0 offset:4 -; GFX7-NEXT: ds_read_u8 v3, v0 offset:3 -; GFX7-NEXT: ds_read_u8 v6, v0 offset:2 -; GFX7-NEXT: ds_read_u8 v7, v0 offset:1 -; GFX7-NEXT: ds_read_u8 v8, v0 -; GFX7-NEXT: s_waitcnt lgkmcnt(7) -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(5) -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: ds_read_u8 v1, v0 offset:6 +; GFX7-NEXT: ds_read_u8 v2, v0 offset:4 +; GFX7-NEXT: ds_read_u8 v3, v0 offset:2 +; GFX7-NEXT: ds_read_u8 v4, v0 offset:1 +; GFX7-NEXT: ds_read_u8 v5, v0 +; GFX7-NEXT: ds_read_u8 v6, v0 offset:3 +; GFX7-NEXT: ds_read_u8 v7, v0 offset:5 +; GFX7-NEXT: ds_read_u8 v8, v0 offset:7 ; GFX7-NEXT: s_waitcnt lgkmcnt(4) -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX7-NEXT: ds_read_u8 v2, v0 offset:11 -; GFX7-NEXT: ds_read_u8 v4, v0 offset:10 -; GFX7-NEXT: ds_read_u8 v5, v0 offset:9 -; GFX7-NEXT: ds_read_u8 v0, v0 offset:8 -; GFX7-NEXT: s_waitcnt lgkmcnt(7) -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX7-NEXT: s_waitcnt lgkmcnt(5) -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; GFX7-NEXT: v_or_b32_e32 v3, v3, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(3) -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX7-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX7-NEXT: s_waitcnt lgkmcnt(2) +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(1) -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX7-NEXT: v_or_b32_e32 v3, v3, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v7 +; GFX7-NEXT: ds_read_u8 v5, v0 offset:11 +; GFX7-NEXT: ds_read_u8 v6, v0 offset:10 +; GFX7-NEXT: ds_read_u8 v7, v0 offset:9 +; GFX7-NEXT: ds_read_u8 v0, v0 offset:8 +; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(4) +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v8 +; GFX7-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(1) +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v5 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, v3 @@ -130,31 +130,37 @@ ; GFX6-NEXT: v_add_i32_e32 v6, vcc, 8, v0 ; GFX6-NEXT: v_add_i32_e32 v7, vcc, 11, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: ds_read_u8 v1, v1 ; GFX6-NEXT: ds_read_u8 v2, v2 ; GFX6-NEXT: ds_read_u8 v3, v3 ; GFX6-NEXT: ds_read_u8 v4, v4 ; GFX6-NEXT: ds_read_u8 v5, v5 ; GFX6-NEXT: ds_read_u8 v6, v6 ; GFX6-NEXT: ds_read_u8 v7, v7 -; GFX6-NEXT: ds_read_u8 v1, v1 ; GFX6-NEXT: ds_read_u8 v8, v0 -; GFX6-NEXT: s_waitcnt lgkmcnt(1) +; GFX6-NEXT: s_waitcnt lgkmcnt(7) ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(6) ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: s_waitcnt lgkmcnt(5) ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 8, v3 +; GFX6-NEXT: s_waitcnt lgkmcnt(4) ; GFX6-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 10, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: ds_read_u8 v4, v4 ; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(4) ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 8, v5 +; GFX6-NEXT: s_waitcnt lgkmcnt(3) ; GFX6-NEXT: v_or_b32_e32 v2, v2, v6 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 10, v0 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, 3, v0 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, 2, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX6-NEXT: ds_read_u8 v4, v4 ; GFX6-NEXT: ds_read_u8 v5, v5 ; GFX6-NEXT: ds_read_u8 v6, v6 ; GFX6-NEXT: ds_read_u8 v0, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(5) ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 8, v7 ; GFX6-NEXT: s_waitcnt lgkmcnt(3) ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 @@ -229,23 +235,22 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_u16 v2, v0 offset:10 -; GFX7-NEXT: ds_read_u16 v3, v0 offset:8 -; GFX7-NEXT: ds_read_u16 v1, v0 offset:6 -; GFX7-NEXT: ds_read_u16 v4, v0 offset:4 -; GFX7-NEXT: ds_read_u16 v5, v0 offset:2 -; GFX7-NEXT: ds_read_u16 v0, v0 -; GFX7-NEXT: s_waitcnt lgkmcnt(5) -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: ds_read_u16 v2, v0 offset:8 +; GFX7-NEXT: ds_read_u16 v1, v0 offset:4 +; GFX7-NEXT: ds_read_u16 v3, v0 offset:2 +; GFX7-NEXT: ds_read_u16 v4, v0 +; GFX7-NEXT: ds_read_u16 v5, v0 offset:6 +; GFX7-NEXT: ds_read_u16 v6, v0 offset:10 ; GFX7-NEXT: s_waitcnt lgkmcnt(3) -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(2) -; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(1) -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: load_lds_v3i32_align2: @@ -257,16 +262,19 @@ ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 8, v0 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, 2, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: ds_read_u16 v1, v1 ; GFX6-NEXT: ds_read_u16 v2, v2 ; GFX6-NEXT: ds_read_u16 v3, v3 ; GFX6-NEXT: ds_read_u16 v4, v4 ; GFX6-NEXT: ds_read_u16 v5, v5 -; GFX6-NEXT: ds_read_u16 v1, v1 ; GFX6-NEXT: ds_read_u16 v0, v0 -; GFX6-NEXT: s_waitcnt lgkmcnt(1) +; GFX6-NEXT: s_waitcnt lgkmcnt(5) ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(4) ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: s_waitcnt lgkmcnt(3) ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX6-NEXT: s_waitcnt lgkmcnt(1) ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -321,8 +329,8 @@ ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b32 v2, v2 -; GFX6-NEXT: ds_read_b32 v1, v1 ; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: ds_read_b32 v1, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -366,8 +374,8 @@ ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b32 v2, v2 -; GFX6-NEXT: ds_read_b32 v1, v1 ; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: ds_read_b32 v1, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -403,10 +411,11 @@ ; GFX6-LABEL: load_lds_v3i32_align16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v1 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 8, v2 ; GFX6-NEXT: ds_read_b64 v[0:1], v0 +; GFX6-NEXT: ds_read_b32 v2, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll --- a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll +++ b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll @@ -40,11 +40,12 @@ ; GCN: v_lshlrev_b32_e32 [[ADDRW:v[0-9]+]], 2, v0 ; CI-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, 0, [[ADDRW]] ; SI-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, 12, [[ADDRW]] -; GCN-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 16, [[ADDRW]] +; SI-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 16, [[ADDRW]] ; GCN-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4 ; GCN: s_barrier +; CI-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 16, [[ADDRW]] ; GCN-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB0]] ; GCN-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB1]] diff --git a/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll b/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll --- a/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll +++ b/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll @@ -4,15 +4,15 @@ define amdgpu_kernel void @zext_shl64_to_32(i64 addrspace(1)* nocapture %out, i32 %x) { ; GCN-LABEL: zext_shl64_to_32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xb -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s0, s0, 2 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: s_lshl_b32 s4, s4, 2 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm %and = and i32 %x, 1073741823 %ext = zext i32 %and to i64 @@ -24,16 +24,16 @@ define amdgpu_kernel void @sext_shl64_to_32(i64 addrspace(1)* nocapture %out, i32 %x) { ; GCN-LABEL: sext_shl64_to_32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xb -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s0, s0, 0x1fffffff -; GCN-NEXT: s_lshl_b32 s0, s0, 2 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: s_and_b32 s4, s4, 0x1fffffff +; GCN-NEXT: s_lshl_b32 s4, s4, 2 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm %and = and i32 %x, 536870911 %ext = sext i32 %and to i64 @@ -45,17 +45,17 @@ define amdgpu_kernel void @zext_shl64_overflow(i64 addrspace(1)* nocapture %out, i32 %x) { ; GCN-LABEL: zext_shl64_overflow: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xb -; GCN-NEXT: s_mov_b32 s1, 0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s5, 0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bitset0_b32 s0, 31 -; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: s_bitset0_b32 s4, 31 +; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm %and = and i32 %x, 2147483647 %ext = zext i32 %and to i64 @@ -67,17 +67,17 @@ define amdgpu_kernel void @sext_shl64_overflow(i64 addrspace(1)* nocapture %out, i32 %x) { ; GCN-LABEL: sext_shl64_overflow: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xb -; GCN-NEXT: s_mov_b32 s1, 0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s5, 0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bitset0_b32 s0, 31 -; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: s_bitset0_b32 s4, 31 +; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm %and = and i32 %x, 2147483647 %ext = sext i32 %and to i64 @@ -112,22 +112,22 @@ define amdgpu_kernel void @muli24_shl64(i64 addrspace(1)* nocapture %arg, i32 addrspace(1)* nocapture readonly %arg1) { ; GCN-LABEL: muli24_shl64: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b64 s[0:1], s[6:7] -; GCN-NEXT: buffer_load_dword v1, v[1:2], s[0:3], 0 addr64 +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dword v1, v[1:2], s[4:7], 0 addr64 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 3, v0 -; GCN-NEXT: s_mov_b64 s[6:7], s[2:3] +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] ; GCN-NEXT: v_mov_b32_e32 v4, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_or_b32_e32 v0, 0x800000, v1 ; GCN-NEXT: v_mul_i32_i24_e32 v0, -7, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GCN-NEXT: buffer_store_dwordx2 v[1:2], v[3:4], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dwordx2 v[1:2], v[3:4], s[0:3], 0 addr64 ; GCN-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -7,9 +7,9 @@ define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { ; GFX9-LABEL: s_lshr_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: s_load_dword s5, s[0:1], 0x30 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 @@ -19,27 +19,27 @@ ; ; VI-LABEL: s_lshr_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dword s0, s[0:1], 0x30 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dword s3, s[0:1], 0x30 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: s_and_b32 s1, s4, 0xffff -; VI-NEXT: s_lshr_b32 s4, s4, 16 -; VI-NEXT: s_lshr_b32 s5, s0, 16 -; VI-NEXT: s_lshr_b32 s4, s4, s5 -; VI-NEXT: s_lshr_b32 s0, s1, s0 -; VI-NEXT: s_lshl_b32 s1, s4, 16 -; VI-NEXT: s_or_b32 s0, s0, s1 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_and_b32 s4, s2, 0xffff +; VI-NEXT: s_lshr_b32 s2, s2, 16 +; VI-NEXT: s_lshr_b32 s5, s3, 16 +; VI-NEXT: s_lshr_b32 s2, s2, s5 +; VI-NEXT: s_lshr_b32 s3, s4, s3 +; VI-NEXT: s_lshl_b32 s2, s2, 16 +; VI-NEXT: s_or_b32 s2, s3, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; CI-LABEL: s_lshr_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; CI-NEXT: s_load_dword s2, s[0:1], 0xb +; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; CI-NEXT: s_load_dword s0, s[0:1], 0xc ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 @@ -108,16 +108,16 @@ ; ; CI-LABEL: v_lshr_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[0:1], s[6:7] -; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; CI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 -; CI-NEXT: s_mov_b64 s[6:7], s[2:3] +; CI-NEXT: s_mov_b64 s[4:5], s[2:3] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; CI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 +; CI-NEXT: s_mov_b64 s[2:3], s[6:7] ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -127,7 +127,7 @@ ; CI-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 -; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm ; ; GFX10-LABEL: v_lshr_v2i16: @@ -346,22 +346,22 @@ ; ; CI-LABEL: lshr_imm_v_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[0:1], s[6:7] -; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; CI-NEXT: s_mov_b64 s[6:7], s[2:3] +; CI-NEXT: s_mov_b64 s[4:5], s[2:3] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; CI-NEXT: s_mov_b64 s[2:3], s[6:7] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; CI-NEXT: v_lshr_b32_e32 v3, 8, v3 ; CI-NEXT: v_lshr_b32_e32 v2, 8, v2 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 -; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm ; ; GFX10-LABEL: lshr_imm_v_v2i16: @@ -404,32 +404,32 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v1, 24, v0 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_lshrrev_b32_e32 v2, 24, v3 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; CI-LABEL: lshr_v_imm_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[0:1], s[6:7] -; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; CI-NEXT: s_mov_b64 s[6:7], s[2:3] +; CI-NEXT: s_mov_b64 s[4:5], s[2:3] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; CI-NEXT: s_mov_b64 s[2:3], s[6:7] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v2, 8, v2 ; CI-NEXT: v_and_b32_e32 v2, 0xff00ff, v2 -; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm ; ; GFX10-LABEL: lshr_v_imm_v2i16: @@ -493,22 +493,22 @@ ; ; CI-LABEL: v_lshr_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[0:1], s[6:7] -; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 -; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8 -; CI-NEXT: s_mov_b32 s0, 0xffff -; CI-NEXT: s_mov_b64 s[6:7], s[2:3] +; CI-NEXT: s_mov_b64 s[4:5], s[2:3] +; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 +; CI-NEXT: s_mov_b32 s4, 0xffff +; CI-NEXT: s_mov_b64 s[2:3], s[6:7] ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; CI-NEXT: v_and_b32_e32 v2, s0, v2 +; CI-NEXT: v_and_b32_e32 v2, s4, v2 ; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; CI-NEXT: v_and_b32_e32 v3, s0, v3 +; CI-NEXT: v_and_b32_e32 v3, s4, v3 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 ; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 @@ -520,7 +520,7 @@ ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; CI-NEXT: v_or_b32_e32 v3, v3, v5 ; CI-NEXT: v_or_b32_e32 v2, v2, v4 -; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm ; ; GFX10-LABEL: v_lshr_v4i16: @@ -585,22 +585,22 @@ ; ; CI-LABEL: lshr_v_imm_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[0:1], s[6:7] -; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 -; CI-NEXT: s_mov_b32 s0, 0xff00ff -; CI-NEXT: s_mov_b64 s[6:7], s[2:3] +; CI-NEXT: s_mov_b64 s[4:5], s[2:3] +; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; CI-NEXT: s_mov_b32 s4, 0xff00ff +; CI-NEXT: s_mov_b64 s[2:3], s[6:7] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 8, v3 ; CI-NEXT: v_lshrrev_b32_e32 v2, 8, v2 -; CI-NEXT: v_and_b32_e32 v3, s0, v3 -; CI-NEXT: v_and_b32_e32 v2, s0, v2 -; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; CI-NEXT: v_and_b32_e32 v3, s4, v3 +; CI-NEXT: v_and_b32_e32 v2, s4, v2 +; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm ; ; GFX10-LABEL: lshr_v_imm_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/max.i16.ll b/llvm/test/CodeGen/AMDGPU/max.i16.ll --- a/llvm/test/CodeGen/AMDGPU/max.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/max.i16.ll @@ -16,14 +16,14 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: flat_load_ushort v1, v[2:3] -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_ushort v5, v[0:1] +; VI-NEXT: flat_load_ushort v2, v[2:3] +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_max_i16_e32 v0, v0, v1 -; VI-NEXT: flat_store_short v[4:5], v0 +; VI-NEXT: v_max_i16_e32 v2, v5, v2 +; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_imax_sge_i16: @@ -118,9 +118,9 @@ ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ushort v4, v[4:5] ; VI-NEXT: flat_load_dword v5, v[0:1] +; VI-NEXT: flat_load_dword v7, v[2:3] ; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v7, v[2:3] ; VI-NEXT: flat_load_ushort v8, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v6 @@ -130,32 +130,32 @@ ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_max_i16_e32 v6, v5, v7 ; VI-NEXT: v_max_i16_sdwa v5, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_max_i16_e32 v4, v4, v8 -; VI-NEXT: v_or_b32_e32 v5, v6, v5 ; VI-NEXT: flat_store_short v[2:3], v4 ; VI-NEXT: flat_store_dword v[0:1], v5 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_imax_sge_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_short_d16 v1, v0, s[2:3] offset:4 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v3, v0, s[2:3] ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_short_d16 v2, v0, s[6:7] offset:4 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v4, v0, s[6:7] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_load_short_d16 v1, v0, s[2:3] offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_pk_max_i16 v1, v2, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_max_i16 v3, v4, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_max_i16 v1, v2, v1 ; GFX9-NEXT: global_store_short v0, v1, s[4:5] offset:4 ; GFX9-NEXT: global_store_dword v0, v3, s[4:5] ; GFX9-NEXT: s_endpgm @@ -239,14 +239,14 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: flat_load_ushort v1, v[2:3] -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_ushort v5, v[0:1] +; VI-NEXT: flat_load_ushort v2, v[2:3] +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_max_i16_e32 v0, v0, v1 -; VI-NEXT: flat_store_short v[4:5], v0 +; VI-NEXT: v_max_i16_e32 v2, v5, v2 +; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_imax_sgt_i16: @@ -287,14 +287,14 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: flat_load_ushort v1, v[2:3] -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_ushort v5, v[0:1] +; VI-NEXT: flat_load_ushort v2, v[2:3] +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_max_u16_e32 v0, v0, v1 -; VI-NEXT: flat_store_short v[4:5], v0 +; VI-NEXT: v_max_u16_e32 v2, v5, v2 +; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_umax_uge_i16: @@ -335,14 +335,14 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: flat_load_ushort v1, v[2:3] -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_ushort v5, v[0:1] +; VI-NEXT: flat_load_ushort v2, v[2:3] +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_max_u16_e32 v0, v0, v1 -; VI-NEXT: flat_store_short v[4:5], v0 +; VI-NEXT: v_max_u16_e32 v2, v5, v2 +; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_umax_ugt_i16: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll @@ -13,11 +13,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_agent_unordered_load: @@ -52,11 +52,11 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_unordered_load: @@ -65,11 +65,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_agent_unordered_load: @@ -78,11 +78,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -98,11 +98,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] glc -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_agent_monotonic_load: @@ -137,11 +137,11 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_load: @@ -150,11 +150,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_load: @@ -163,11 +163,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -183,12 +183,12 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: flat_load_dword v2, v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_agent_acquire_load: @@ -227,11 +227,11 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] glc +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_load: @@ -240,12 +240,12 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_load: @@ -254,12 +254,12 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -276,12 +276,12 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: flat_load_dword v2, v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_load: @@ -325,11 +325,11 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] glc +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_load: @@ -339,12 +339,12 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_load: @@ -354,12 +354,12 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -371,12 +371,12 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; GFX7-LABEL: flat_agent_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -406,32 +406,32 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_agent_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { @@ -443,12 +443,12 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; GFX7-LABEL: flat_agent_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -478,32 +478,32 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { @@ -515,12 +515,12 @@ define amdgpu_kernel void @flat_agent_release_store( ; GFX7-LABEL: flat_agent_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -555,34 +555,34 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_agent_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -595,12 +595,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; GFX7-LABEL: flat_agent_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -635,34 +635,34 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -4742,11 +4742,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_agent_one_as_unordered_load: @@ -4781,11 +4781,11 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_load: @@ -4794,11 +4794,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_unordered_load: @@ -4807,11 +4807,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -4827,11 +4827,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] glc -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_load: @@ -4866,11 +4866,11 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_load: @@ -4879,11 +4879,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_load: @@ -4892,11 +4892,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -4912,13 +4912,13 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: flat_load_dword v2, v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_load: @@ -4959,12 +4959,12 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] glc +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_load: @@ -4973,13 +4973,13 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_load: @@ -4988,12 +4988,12 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -5010,13 +5010,13 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: flat_load_dword v2, v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_load: @@ -5062,12 +5062,12 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] glc +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_load: @@ -5077,13 +5077,13 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_load: @@ -5093,12 +5093,12 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -5110,12 +5110,12 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; GFX7-LABEL: flat_agent_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -5145,32 +5145,32 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { @@ -5182,12 +5182,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; GFX7-LABEL: flat_agent_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -5217,32 +5217,32 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { @@ -5254,12 +5254,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; GFX7-LABEL: flat_agent_one_as_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -5294,34 +5294,34 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -5334,12 +5334,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; GFX7-LABEL: flat_agent_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -5374,34 +5374,34 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll @@ -13,11 +13,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] glc slc -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] glc slc +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_nontemporal_load_0: @@ -52,11 +52,11 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] glc slc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc slc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_load_0: @@ -65,11 +65,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc slc -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc slc +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_load_0: @@ -78,14 +78,12 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc slc -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc slc +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %in, i32* %out) { entry: %val = load i32, i32* %in, align 4, !nontemporal !0 @@ -97,12 +95,12 @@ ; GFX7-LABEL: flat_nontemporal_load_1: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, s0, v2 -; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GFX7-NEXT: flat_load_dword v2, v[2:3] glc slc +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v2, v[0:1] glc slc ; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -140,12 +138,12 @@ ; SKIP-CACHE-INV-LABEL: flat_nontemporal_load_1: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v2, vcc, s0, v2 -; SKIP-CACHE-INV-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[2:3] glc slc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; SKIP-CACHE-INV-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc slc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -155,12 +153,12 @@ ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_load_1: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s1 -; GFX90A-NOTTGSPLIT-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX90A-NOTTGSPLIT-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[2:3] glc slc +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc slc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -170,19 +168,17 @@ ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_load_1: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX90A-TGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s1 -; GFX90A-TGSPLIT-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX90A-TGSPLIT-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[2:3] glc slc +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX90A-TGSPLIT-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc slc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %in, i32* %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -199,11 +195,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 glc slc +; GFX7-NEXT: flat_store_dword v[0:1], v2 glc slc ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_nontemporal_store_0: @@ -238,11 +234,11 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 glc slc +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 glc slc ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_store_0: @@ -251,11 +247,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 glc slc +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 glc slc ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_store_0: @@ -264,14 +260,12 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 glc slc +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 glc slc ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %in, i32* %out) { entry: %val = load i32, i32* %in, align 4 @@ -367,8 +361,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 glc slc ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %in, i32* %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll @@ -13,11 +13,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_unordered_load: @@ -52,11 +52,11 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_load: @@ -65,11 +65,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_load: @@ -78,11 +78,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -98,11 +98,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_load: @@ -137,11 +137,11 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_load: @@ -150,11 +150,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_load: @@ -163,11 +163,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -183,11 +183,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_load: @@ -222,11 +222,11 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_load: @@ -235,11 +235,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_load: @@ -248,11 +248,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -268,11 +268,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_load: @@ -307,11 +307,11 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_load: @@ -320,11 +320,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_load: @@ -333,11 +333,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -349,12 +349,12 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; GFX7-LABEL: flat_singlethread_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -384,32 +384,32 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { @@ -421,12 +421,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; GFX7-LABEL: flat_singlethread_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -456,32 +456,32 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { @@ -493,12 +493,12 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; GFX7-LABEL: flat_singlethread_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -528,32 +528,32 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { @@ -565,12 +565,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; GFX7-LABEL: flat_singlethread_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -600,32 +600,32 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { @@ -4136,11 +4136,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_load: @@ -4175,11 +4175,11 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_load: @@ -4188,11 +4188,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_load: @@ -4201,11 +4201,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -4221,11 +4221,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_load: @@ -4260,11 +4260,11 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load: @@ -4273,11 +4273,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load: @@ -4286,11 +4286,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -4306,11 +4306,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_load: @@ -4345,11 +4345,11 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_load: @@ -4358,11 +4358,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_load: @@ -4371,11 +4371,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -4391,11 +4391,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_load: @@ -4430,11 +4430,11 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load: @@ -4443,11 +4443,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load: @@ -4456,11 +4456,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -4472,12 +4472,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; GFX7-LABEL: flat_singlethread_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -4507,32 +4507,32 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { @@ -4544,12 +4544,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -4579,32 +4579,32 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { @@ -4616,12 +4616,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; GFX7-LABEL: flat_singlethread_one_as_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -4651,32 +4651,32 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { @@ -4688,12 +4688,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -4723,32 +4723,32 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll @@ -13,11 +13,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_system_unordered_load: @@ -52,11 +52,11 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_unordered_load: @@ -65,11 +65,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_system_unordered_load: @@ -78,11 +78,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -98,11 +98,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] glc -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_system_monotonic_load: @@ -137,11 +137,11 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_load: @@ -150,11 +150,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_load: @@ -163,11 +163,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -183,12 +183,12 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: flat_load_dword v2, v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_system_acquire_load: @@ -227,11 +227,11 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] glc +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_load: @@ -240,13 +240,13 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_load: @@ -255,13 +255,13 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -278,12 +278,12 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: flat_load_dword v2, v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_system_seq_cst_load: @@ -327,11 +327,11 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] glc +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_load: @@ -341,13 +341,13 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_load: @@ -357,13 +357,13 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -375,12 +375,12 @@ define amdgpu_kernel void @flat_system_unordered_store( ; GFX7-LABEL: flat_system_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -410,32 +410,32 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_system_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { @@ -447,12 +447,12 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; GFX7-LABEL: flat_system_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -482,32 +482,32 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { @@ -519,12 +519,12 @@ define amdgpu_kernel void @flat_system_release_store( ; GFX7-LABEL: flat_system_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -559,23 +559,23 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -583,11 +583,11 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -601,12 +601,12 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; GFX7-LABEL: flat_system_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -641,23 +641,23 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -665,11 +665,11 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -4868,11 +4868,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_system_one_as_unordered_load: @@ -4907,11 +4907,11 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_load: @@ -4920,11 +4920,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_unordered_load: @@ -4933,11 +4933,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -4953,11 +4953,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] glc -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_load: @@ -4992,11 +4992,11 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_load: @@ -5005,11 +5005,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_load: @@ -5018,11 +5018,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -5038,13 +5038,13 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: flat_load_dword v2, v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_load: @@ -5085,12 +5085,12 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] glc +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_load: @@ -5099,14 +5099,14 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_load: @@ -5115,13 +5115,13 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -5138,13 +5138,13 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: flat_load_dword v2, v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_load: @@ -5190,12 +5190,12 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] glc +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_load: @@ -5205,14 +5205,14 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_load: @@ -5222,13 +5222,13 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -5240,12 +5240,12 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; GFX7-LABEL: flat_system_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -5275,32 +5275,32 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { @@ -5312,12 +5312,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; GFX7-LABEL: flat_system_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -5347,32 +5347,32 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { @@ -5384,12 +5384,12 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; GFX7-LABEL: flat_system_one_as_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -5424,23 +5424,23 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -5448,11 +5448,11 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -5466,12 +5466,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; GFX7-LABEL: flat_system_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -5506,23 +5506,23 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -5530,11 +5530,11 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll @@ -11,12 +11,12 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: flat_load_dword v2, v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_nontemporal_load_0: @@ -53,12 +53,12 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] glc +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -71,12 +71,12 @@ ; GFX7-LABEL: flat_nontemporal_load_1: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, s0, v2 -; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GFX7-NEXT: flat_load_dword v2, v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v2, v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 @@ -117,12 +117,12 @@ ; SKIP-CACHE-INV-LABEL: flat_nontemporal_load_1: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v2, vcc, s0, v2 -; SKIP-CACHE-INV-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; SKIP-CACHE-INV-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -145,11 +145,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; @@ -187,11 +187,11 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm i32* %in, i32* %out) { @@ -279,12 +279,12 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_volatile_workgroup_acquire_load: @@ -321,12 +321,12 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -338,12 +338,12 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; GFX7-LABEL: flat_volatile_workgroup_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -377,12 +377,12 @@ ; ; SKIP-CACHE-INV-LABEL: flat_volatile_workgroup_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll @@ -13,11 +13,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_unordered_load: @@ -52,11 +52,11 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_unordered_load: @@ -65,11 +65,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_unordered_load: @@ -78,11 +78,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -98,11 +98,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_load: @@ -137,11 +137,11 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_load: @@ -150,11 +150,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_load: @@ -163,11 +163,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -183,11 +183,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_load: @@ -222,11 +222,11 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_load: @@ -235,11 +235,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_load: @@ -248,11 +248,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -268,11 +268,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_load: @@ -307,11 +307,11 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_load: @@ -320,11 +320,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_load: @@ -333,11 +333,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -349,12 +349,12 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; GFX7-LABEL: flat_wavefront_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -384,32 +384,32 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { @@ -421,12 +421,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; GFX7-LABEL: flat_wavefront_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -456,32 +456,32 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { @@ -493,12 +493,12 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; GFX7-LABEL: flat_wavefront_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -528,32 +528,32 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { @@ -565,12 +565,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; GFX7-LABEL: flat_wavefront_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -600,32 +600,32 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { @@ -4136,11 +4136,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_unordered_load: @@ -4175,11 +4175,11 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_load: @@ -4188,11 +4188,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_load: @@ -4201,11 +4201,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -4221,11 +4221,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_load: @@ -4260,11 +4260,11 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load: @@ -4273,11 +4273,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load: @@ -4286,11 +4286,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -4306,11 +4306,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_load: @@ -4345,11 +4345,11 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_load: @@ -4358,11 +4358,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_load: @@ -4371,11 +4371,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -4391,11 +4391,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_load: @@ -4430,11 +4430,11 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load: @@ -4443,11 +4443,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load: @@ -4456,11 +4456,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -4472,12 +4472,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; GFX7-LABEL: flat_wavefront_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -4507,32 +4507,32 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { @@ -4544,12 +4544,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -4579,32 +4579,32 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { @@ -4616,12 +4616,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; GFX7-LABEL: flat_wavefront_one_as_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -4651,32 +4651,32 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { @@ -4688,12 +4688,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -4723,32 +4723,32 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll @@ -13,11 +13,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_unordered_load: @@ -52,11 +52,11 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_unordered_load: @@ -65,11 +65,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_unordered_load: @@ -78,11 +78,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -98,11 +98,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_monotonic_load: @@ -137,11 +137,11 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_load: @@ -150,11 +150,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_load: @@ -163,11 +163,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -183,12 +183,12 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_load: @@ -225,12 +225,12 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_load: @@ -239,12 +239,12 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_load: @@ -253,12 +253,12 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -275,12 +275,12 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_load: @@ -321,12 +321,12 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_load: @@ -336,12 +336,12 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_load: @@ -351,12 +351,12 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -368,12 +368,12 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; GFX7-LABEL: flat_workgroup_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -403,32 +403,32 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { @@ -440,12 +440,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; GFX7-LABEL: flat_workgroup_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -475,32 +475,32 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { @@ -512,12 +512,12 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; GFX7-LABEL: flat_workgroup_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -551,34 +551,34 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -591,12 +591,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; GFX7-LABEL: flat_workgroup_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -630,34 +630,34 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -4169,11 +4169,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_unordered_load: @@ -4208,11 +4208,11 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_load: @@ -4221,11 +4221,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_load: @@ -4234,11 +4234,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -4254,11 +4254,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_load: @@ -4293,11 +4293,11 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load: @@ -4306,11 +4306,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load: @@ -4319,11 +4319,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -4339,11 +4339,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_load: @@ -4380,11 +4380,11 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_load: @@ -4393,11 +4393,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_load: @@ -4406,12 +4406,12 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -4427,11 +4427,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_load: @@ -4470,11 +4470,11 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load: @@ -4483,11 +4483,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load: @@ -4497,12 +4497,12 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -4514,12 +4514,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; GFX7-LABEL: flat_workgroup_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -4549,32 +4549,32 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { @@ -4586,12 +4586,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; GFX7-LABEL: flat_workgroup_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -4621,32 +4621,32 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { @@ -4658,12 +4658,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX7-LABEL: flat_workgroup_one_as_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -4695,32 +4695,32 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -4733,12 +4733,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -4770,32 +4770,32 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll @@ -10,17 +10,15 @@ define amdgpu_kernel void @global_agent_unordered_load( ; GFX6-LABEL: global_agent_unordered_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm @@ -31,11 +29,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_agent_unordered_load: @@ -60,17 +58,15 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -104,17 +100,15 @@ define amdgpu_kernel void @global_agent_monotonic_load( ; GFX6-LABEL: global_agent_monotonic_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm @@ -125,11 +119,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] glc -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_agent_monotonic_load: @@ -154,17 +148,15 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -198,19 +190,17 @@ define amdgpu_kernel void @global_agent_acquire_load( ; GFX6-LABEL: global_agent_acquire_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -220,12 +210,12 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: flat_load_dword v2, v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_agent_acquire_load: @@ -254,18 +244,16 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -300,20 +288,18 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; GFX6-LABEL: global_agent_seq_cst_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -324,12 +310,12 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: flat_load_dword v2, v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_agent_seq_cst_load: @@ -360,19 +346,17 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -418,12 +402,12 @@ ; ; GFX7-LABEL: global_agent_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -499,12 +483,12 @@ ; ; GFX7-LABEL: global_agent_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -581,12 +565,12 @@ ; ; GFX7-LABEL: global_agent_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -671,12 +655,12 @@ ; ; GFX7-LABEL: global_agent_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -749,12 +733,12 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw( ; GFX6-LABEL: global_agent_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -793,19 +777,19 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -814,8 +798,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -830,12 +814,12 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; GFX6-LABEL: global_agent_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 @@ -884,20 +868,20 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -908,8 +892,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_agent_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -926,12 +910,12 @@ define amdgpu_kernel void @global_agent_release_atomicrmw( ; GFX6-LABEL: global_agent_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm @@ -976,20 +960,20 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -999,8 +983,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_agent_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1016,12 +1000,12 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; GFX6-LABEL: global_agent_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1076,21 +1060,21 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1102,8 +1086,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1121,12 +1105,12 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; GFX6-LABEL: global_agent_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1181,21 +1165,21 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1207,8 +1191,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1226,12 +1210,12 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw( ; GFX6-LABEL: global_agent_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 @@ -1284,21 +1268,21 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1310,8 +1294,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_agent_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1330,12 +1314,12 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; GFX6-LABEL: global_agent_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1394,22 +1378,22 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1422,8 +1406,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1443,12 +1427,12 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; GFX6-LABEL: global_agent_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1507,22 +1491,22 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1535,8 +1519,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1556,13 +1540,13 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -1606,34 +1590,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -1645,13 +1629,13 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; GFX6-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 @@ -1705,37 +1689,37 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -1749,13 +1733,13 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; GFX6-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm @@ -1805,37 +1789,37 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -1847,13 +1831,13 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1913,40 +1897,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -1960,13 +1944,13 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -2026,40 +2010,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -2073,13 +2057,13 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; GFX6-LABEL: global_agent_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 @@ -2133,37 +2117,37 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -2177,13 +2161,13 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; GFX6-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 @@ -2237,37 +2221,37 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -2281,13 +2265,13 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; GFX6-LABEL: global_agent_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -2347,40 +2331,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -2394,13 +2378,13 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -2460,40 +2444,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -2507,13 +2491,13 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -2573,40 +2557,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -2620,13 +2604,13 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -2686,40 +2670,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -2733,13 +2717,13 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -2799,40 +2783,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -2846,13 +2830,13 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; GFX6-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -2912,40 +2896,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -2959,13 +2943,13 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -3025,40 +3009,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -3072,13 +3056,13 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -3138,40 +3122,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -3185,13 +3169,13 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -3245,40 +3229,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3292,13 +3276,13 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 @@ -3358,42 +3342,42 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3407,13 +3391,13 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -3473,43 +3457,43 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3523,13 +3507,13 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -3595,45 +3579,45 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3647,13 +3631,13 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -3719,45 +3703,45 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3771,13 +3755,13 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 @@ -3837,42 +3821,42 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3886,13 +3870,13 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 @@ -3952,42 +3936,42 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4001,13 +3985,13 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; GFX6-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -4073,45 +4057,45 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4125,13 +4109,13 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -4197,45 +4181,45 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4249,13 +4233,13 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -4321,45 +4305,45 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4373,13 +4357,13 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -4445,45 +4429,45 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4497,13 +4481,13 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -4569,45 +4553,45 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4621,13 +4605,13 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -4693,45 +4677,45 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4745,13 +4729,13 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -4817,45 +4801,45 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4869,13 +4853,13 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -4941,45 +4925,45 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4993,17 +4977,15 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load( ; GFX6-LABEL: global_agent_one_as_unordered_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm @@ -5014,11 +4996,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_agent_one_as_unordered_load: @@ -5043,17 +5025,15 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -5087,17 +5067,15 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load( ; GFX6-LABEL: global_agent_one_as_monotonic_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm @@ -5108,11 +5086,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] glc -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_agent_one_as_monotonic_load: @@ -5137,17 +5115,15 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -5181,19 +5157,17 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load( ; GFX6-LABEL: global_agent_one_as_acquire_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -5203,12 +5177,12 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: flat_load_dword v2, v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_agent_one_as_acquire_load: @@ -5237,18 +5211,16 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -5283,20 +5255,18 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load( ; GFX6-LABEL: global_agent_one_as_seq_cst_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -5307,12 +5277,12 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: flat_load_dword v2, v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_load: @@ -5343,19 +5313,17 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -5401,12 +5369,12 @@ ; ; GFX7-LABEL: global_agent_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -5482,12 +5450,12 @@ ; ; GFX7-LABEL: global_agent_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -5564,12 +5532,12 @@ ; ; GFX7-LABEL: global_agent_one_as_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -5654,12 +5622,12 @@ ; ; GFX7-LABEL: global_agent_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -5732,12 +5700,12 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( ; GFX6-LABEL: global_agent_one_as_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -5776,19 +5744,19 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5797,8 +5765,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5813,12 +5781,12 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; GFX6-LABEL: global_agent_one_as_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 @@ -5867,20 +5835,20 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5891,8 +5859,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5909,12 +5877,12 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( ; GFX6-LABEL: global_agent_one_as_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm @@ -5959,20 +5927,20 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5982,8 +5950,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5999,12 +5967,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; GFX6-LABEL: global_agent_one_as_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6059,21 +6027,21 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6085,8 +6053,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6104,12 +6072,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; GFX6-LABEL: global_agent_one_as_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6164,21 +6132,21 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6190,8 +6158,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6209,12 +6177,12 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( ; GFX6-LABEL: global_agent_one_as_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 @@ -6267,21 +6235,21 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6293,8 +6261,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6313,12 +6281,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; GFX6-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6377,22 +6345,22 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6405,8 +6373,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6426,12 +6394,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; GFX6-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6490,22 +6458,22 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6518,8 +6486,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6539,13 +6507,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -6589,34 +6557,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6628,13 +6596,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; GFX6-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 @@ -6688,37 +6656,37 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -6732,13 +6700,13 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; GFX6-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm @@ -6788,37 +6756,37 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6830,13 +6798,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6896,40 +6864,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -6943,13 +6911,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7009,40 +6977,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -7056,13 +7024,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; GFX6-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 @@ -7116,37 +7084,37 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -7160,13 +7128,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; GFX6-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 @@ -7220,37 +7188,37 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -7264,13 +7232,13 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; GFX6-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7330,40 +7298,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -7377,13 +7345,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7443,40 +7411,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -7490,13 +7458,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7556,40 +7524,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -7603,13 +7571,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7669,40 +7637,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -7716,13 +7684,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7782,40 +7750,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -7829,13 +7797,13 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; GFX6-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7895,40 +7863,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -7942,13 +7910,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8008,40 +7976,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -8055,13 +8023,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8121,40 +8089,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -8168,13 +8136,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -8228,40 +8196,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -8275,13 +8243,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 @@ -8341,42 +8309,42 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -8390,13 +8358,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8462,45 +8430,45 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -8514,13 +8482,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8586,45 +8554,45 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -8638,13 +8606,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 @@ -8704,42 +8672,42 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -8753,13 +8721,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 @@ -8819,42 +8787,42 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -8868,13 +8836,13 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; GFX6-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8940,45 +8908,45 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -8992,13 +8960,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -9064,45 +9032,45 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -9116,13 +9084,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -9188,45 +9156,45 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -9240,13 +9208,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -9312,45 +9280,45 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -9364,13 +9332,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -9436,45 +9404,45 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -9488,13 +9456,13 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -9560,45 +9528,45 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -9612,13 +9580,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -9684,45 +9652,45 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -9736,13 +9704,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -9808,45 +9776,45 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll @@ -91,8 +91,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load i32, i32 addrspace(1)* %in, align 4, !nontemporal !0 @@ -103,30 +101,30 @@ define amdgpu_kernel void @global_nontemporal_load_1( ; GFX6-LABEL: global_nontemporal_load_1: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX6-NEXT: v_mov_b32_e32 v1, 0 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s6 -; GFX6-NEXT: s_mov_b32 s1, s7 -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s7, s3 -; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 glc slc +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 +; GFX6-NEXT: s_mov_b32 s2, 0 +; GFX6-NEXT: s_mov_b32 s3, s7 +; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc slc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_nontemporal_load_1: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, s0, v2 -; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GFX7-NEXT: flat_load_dword v2, v[2:3] glc slc +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v2, v[0:1] glc slc ; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -157,19 +155,19 @@ ; ; SKIP-CACHE-INV-LABEL: global_nontemporal_load_1: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, 0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 glc slc +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s7 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc slc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_nontemporal_load_1: @@ -193,8 +191,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -288,8 +284,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] glc slc ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load i32, i32 addrspace(1)* %in, align 4 @@ -385,8 +379,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] glc slc ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll @@ -10,17 +10,15 @@ define amdgpu_kernel void @global_singlethread_unordered_load( ; GFX6-LABEL: global_singlethread_unordered_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm @@ -31,11 +29,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_unordered_load: @@ -60,17 +58,15 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -104,17 +100,15 @@ define amdgpu_kernel void @global_singlethread_monotonic_load( ; GFX6-LABEL: global_singlethread_monotonic_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm @@ -125,11 +119,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_monotonic_load: @@ -154,17 +148,15 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -198,17 +190,15 @@ define amdgpu_kernel void @global_singlethread_acquire_load( ; GFX6-LABEL: global_singlethread_acquire_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm @@ -219,11 +209,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_acquire_load: @@ -248,17 +238,15 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -292,17 +280,15 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load( ; GFX6-LABEL: global_singlethread_seq_cst_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm @@ -313,11 +299,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_seq_cst_load: @@ -342,17 +328,15 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -397,12 +381,12 @@ ; ; GFX7-LABEL: global_singlethread_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -478,12 +462,12 @@ ; ; GFX7-LABEL: global_singlethread_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -559,12 +543,12 @@ ; ; GFX7-LABEL: global_singlethread_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -640,12 +624,12 @@ ; ; GFX7-LABEL: global_singlethread_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -710,12 +694,12 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw( ; GFX6-LABEL: global_singlethread_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -754,19 +738,19 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -775,8 +759,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -791,12 +775,12 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; GFX6-LABEL: global_singlethread_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -835,19 +819,19 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -856,8 +840,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -872,12 +856,12 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; GFX6-LABEL: global_singlethread_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -916,19 +900,19 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -937,8 +921,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -953,12 +937,12 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; GFX6-LABEL: global_singlethread_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -997,19 +981,19 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1018,8 +1002,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1034,12 +1018,12 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; GFX6-LABEL: global_singlethread_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -1078,19 +1062,19 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1099,8 +1083,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1115,12 +1099,12 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw( ; GFX6-LABEL: global_singlethread_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1167,21 +1151,21 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1192,8 +1176,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1211,12 +1195,12 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; GFX6-LABEL: global_singlethread_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1263,21 +1247,21 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1288,8 +1272,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1307,12 +1291,12 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; GFX6-LABEL: global_singlethread_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1359,21 +1343,21 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1384,8 +1368,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1403,13 +1387,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -1453,34 +1437,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -1492,13 +1476,13 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; GFX6-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -1542,34 +1526,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -1581,13 +1565,13 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; GFX6-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -1631,34 +1615,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -1670,13 +1654,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -1720,34 +1704,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -1759,13 +1743,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -1809,34 +1793,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -1848,13 +1832,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; GFX6-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -1898,34 +1882,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -1937,13 +1921,13 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; GFX6-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -1987,34 +1971,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -2026,13 +2010,13 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; GFX6-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -2076,34 +2060,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -2115,13 +2099,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -2165,34 +2149,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -2204,13 +2188,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -2254,34 +2238,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -2293,13 +2277,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -2343,34 +2327,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -2382,13 +2366,13 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -2432,34 +2416,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -2471,13 +2455,13 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; GFX6-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -2521,34 +2505,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -2560,13 +2544,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -2610,34 +2594,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -2649,13 +2633,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -2699,34 +2683,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -2738,13 +2722,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -2798,40 +2782,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -2845,13 +2829,13 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -2905,40 +2889,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -2952,13 +2936,13 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -3012,40 +2996,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3059,13 +3043,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -3119,40 +3103,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3166,13 +3150,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -3226,40 +3210,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3273,13 +3257,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -3333,40 +3317,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3380,13 +3364,13 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -3440,40 +3424,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3487,13 +3471,13 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -3547,40 +3531,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3594,13 +3578,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -3654,40 +3638,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3701,13 +3685,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -3761,40 +3745,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3808,13 +3792,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -3868,40 +3852,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3915,13 +3899,13 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -3975,40 +3959,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4022,13 +4006,13 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -4082,40 +4066,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4129,13 +4113,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -4189,40 +4173,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4236,13 +4220,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -4296,40 +4280,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4343,17 +4327,15 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load( ; GFX6-LABEL: global_singlethread_one_as_unordered_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm @@ -4364,11 +4346,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_one_as_unordered_load: @@ -4393,17 +4375,15 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -4437,17 +4417,15 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load( ; GFX6-LABEL: global_singlethread_one_as_monotonic_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm @@ -4458,11 +4436,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_load: @@ -4487,17 +4465,15 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -4531,17 +4507,15 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load( ; GFX6-LABEL: global_singlethread_one_as_acquire_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm @@ -4552,11 +4526,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_load: @@ -4581,17 +4555,15 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -4625,17 +4597,15 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( ; GFX6-LABEL: global_singlethread_one_as_seq_cst_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm @@ -4646,11 +4616,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_load: @@ -4675,17 +4645,15 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -4730,12 +4698,12 @@ ; ; GFX7-LABEL: global_singlethread_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -4811,12 +4779,12 @@ ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -4892,12 +4860,12 @@ ; ; GFX7-LABEL: global_singlethread_one_as_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -4973,12 +4941,12 @@ ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -5043,12 +5011,12 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw( ; GFX6-LABEL: global_singlethread_one_as_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -5087,19 +5055,19 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5108,8 +5076,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5124,12 +5092,12 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; GFX6-LABEL: global_singlethread_one_as_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -5168,19 +5136,19 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5189,8 +5157,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5205,12 +5173,12 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; GFX6-LABEL: global_singlethread_one_as_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -5249,19 +5217,19 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5270,8 +5238,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5286,12 +5254,12 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; GFX6-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -5330,19 +5298,19 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5351,8 +5319,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5367,12 +5335,12 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; GFX6-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -5411,19 +5379,19 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5432,8 +5400,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5448,12 +5416,12 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw( ; GFX6-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -5500,21 +5468,21 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5525,8 +5493,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5544,12 +5512,12 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX6-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -5596,21 +5564,21 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5621,8 +5589,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5640,12 +5608,12 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX6-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -5692,21 +5660,21 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5717,8 +5685,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5736,13 +5704,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -5786,34 +5754,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -5825,13 +5793,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -5875,34 +5843,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -5914,13 +5882,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -5964,34 +5932,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6003,13 +5971,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -6053,34 +6021,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6092,13 +6060,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -6142,34 +6110,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6181,13 +6149,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -6231,34 +6199,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6270,13 +6238,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -6320,34 +6288,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6359,13 +6327,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -6409,34 +6377,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6448,13 +6416,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -6498,34 +6466,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6537,13 +6505,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -6587,34 +6555,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6626,13 +6594,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -6676,34 +6644,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6715,13 +6683,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -6765,34 +6733,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6804,13 +6772,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -6854,34 +6822,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6893,13 +6861,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -6943,34 +6911,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6982,13 +6950,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -7032,34 +7000,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -7071,13 +7039,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -7131,40 +7099,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -7178,13 +7146,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -7238,40 +7206,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -7285,13 +7253,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -7345,40 +7313,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -7392,13 +7360,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -7452,40 +7420,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -7499,13 +7467,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -7559,40 +7527,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -7606,13 +7574,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -7666,40 +7634,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -7713,13 +7681,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -7773,40 +7741,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -7820,13 +7788,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -7880,40 +7848,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -7927,13 +7895,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -7987,40 +7955,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -8034,13 +8002,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -8094,40 +8062,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -8141,13 +8109,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -8201,40 +8169,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -8248,13 +8216,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -8308,40 +8276,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -8355,13 +8323,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -8415,40 +8383,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -8462,13 +8430,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -8522,40 +8490,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -8569,13 +8537,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -8629,40 +8597,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll @@ -10,17 +10,15 @@ define amdgpu_kernel void @global_system_unordered_load( ; GFX6-LABEL: global_system_unordered_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm @@ -31,11 +29,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_system_unordered_load: @@ -60,17 +58,15 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -104,17 +100,15 @@ define amdgpu_kernel void @global_system_monotonic_load( ; GFX6-LABEL: global_system_monotonic_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm @@ -125,11 +119,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] glc -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_system_monotonic_load: @@ -154,17 +148,15 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -198,19 +190,17 @@ define amdgpu_kernel void @global_system_acquire_load( ; GFX6-LABEL: global_system_acquire_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -220,12 +210,12 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: flat_load_dword v2, v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_system_acquire_load: @@ -254,18 +244,16 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -302,20 +290,18 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; GFX6-LABEL: global_system_seq_cst_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -326,12 +312,12 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: flat_load_dword v2, v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_system_seq_cst_load: @@ -362,19 +348,17 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -422,12 +406,12 @@ ; ; GFX7-LABEL: global_system_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -503,12 +487,12 @@ ; ; GFX7-LABEL: global_system_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -585,12 +569,12 @@ ; ; GFX7-LABEL: global_system_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -677,12 +661,12 @@ ; ; GFX7-LABEL: global_system_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -757,12 +741,12 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw( ; GFX6-LABEL: global_system_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -801,19 +785,19 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -822,8 +806,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -838,12 +822,12 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; GFX6-LABEL: global_system_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 @@ -892,20 +876,20 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -917,8 +901,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -936,12 +920,12 @@ define amdgpu_kernel void @global_system_release_atomicrmw( ; GFX6-LABEL: global_system_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm @@ -986,20 +970,20 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1010,8 +994,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1028,12 +1012,12 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; GFX6-LABEL: global_system_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1088,21 +1072,21 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1116,8 +1100,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1137,12 +1121,12 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; GFX6-LABEL: global_system_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1197,21 +1181,21 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1225,8 +1209,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1246,12 +1230,12 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( ; GFX6-LABEL: global_system_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 @@ -1304,21 +1288,21 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1331,8 +1315,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1352,12 +1336,12 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; GFX6-LABEL: global_system_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1416,22 +1400,22 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1446,8 +1430,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1469,12 +1453,12 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; GFX6-LABEL: global_system_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1533,22 +1517,22 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1563,8 +1547,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1586,13 +1570,13 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -1636,34 +1620,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -1675,13 +1659,13 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; GFX6-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 @@ -1735,25 +1719,25 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol @@ -1761,12 +1745,12 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -1781,13 +1765,13 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; GFX6-LABEL: global_system_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm @@ -1837,39 +1821,39 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -1881,13 +1865,13 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1947,28 +1931,28 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol @@ -1976,14 +1960,14 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -1998,13 +1982,13 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -2064,28 +2048,28 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol @@ -2093,14 +2077,14 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -2115,13 +2099,13 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; GFX6-LABEL: global_system_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 @@ -2175,25 +2159,25 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol @@ -2201,12 +2185,12 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -2221,13 +2205,13 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; GFX6-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 @@ -2281,25 +2265,25 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol @@ -2307,12 +2291,12 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -2327,13 +2311,13 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; GFX6-LABEL: global_system_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -2393,28 +2377,28 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol @@ -2422,14 +2406,14 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -2444,13 +2428,13 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -2510,28 +2494,28 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol @@ -2539,14 +2523,14 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -2561,13 +2545,13 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -2627,28 +2611,28 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol @@ -2656,14 +2640,14 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -2678,13 +2662,13 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -2744,28 +2728,28 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol @@ -2773,14 +2757,14 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -2795,13 +2779,13 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -2855,40 +2839,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -2902,13 +2886,13 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 @@ -2968,44 +2952,44 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3019,13 +3003,13 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -3091,49 +3075,49 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3147,13 +3131,13 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -3219,49 +3203,49 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3275,13 +3259,13 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 @@ -3341,44 +3325,44 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3392,13 +3376,13 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 @@ -3458,44 +3442,44 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3509,13 +3493,13 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; GFX6-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -3581,49 +3565,49 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3637,13 +3621,13 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -3709,49 +3693,49 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3765,13 +3749,13 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -3837,49 +3821,49 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3893,13 +3877,13 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -3965,49 +3949,49 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4021,13 +4005,13 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -4093,49 +4077,49 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4149,13 +4133,13 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -4221,49 +4205,49 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4277,13 +4261,13 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -4349,49 +4333,49 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4405,13 +4389,13 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -4477,49 +4461,49 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4533,17 +4517,15 @@ define amdgpu_kernel void @global_system_one_as_unordered_load( ; GFX6-LABEL: global_system_one_as_unordered_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm @@ -4554,11 +4536,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_system_one_as_unordered_load: @@ -4583,17 +4565,15 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -4627,17 +4607,15 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load( ; GFX6-LABEL: global_system_one_as_monotonic_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm @@ -4648,11 +4626,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] glc -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_system_one_as_monotonic_load: @@ -4677,17 +4655,15 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -4721,19 +4697,17 @@ define amdgpu_kernel void @global_system_one_as_acquire_load( ; GFX6-LABEL: global_system_one_as_acquire_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -4743,12 +4717,12 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: flat_load_dword v2, v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_system_one_as_acquire_load: @@ -4777,18 +4751,16 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -4825,20 +4797,18 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load( ; GFX6-LABEL: global_system_one_as_seq_cst_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -4849,12 +4819,12 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: flat_load_dword v2, v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_load: @@ -4885,19 +4855,17 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -4945,12 +4913,12 @@ ; ; GFX7-LABEL: global_system_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -5026,12 +4994,12 @@ ; ; GFX7-LABEL: global_system_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -5108,12 +5076,12 @@ ; ; GFX7-LABEL: global_system_one_as_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -5200,12 +5168,12 @@ ; ; GFX7-LABEL: global_system_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -5280,12 +5248,12 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( ; GFX6-LABEL: global_system_one_as_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -5324,19 +5292,19 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5345,8 +5313,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5361,12 +5329,12 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; GFX6-LABEL: global_system_one_as_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 @@ -5415,20 +5383,20 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5440,8 +5408,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5459,12 +5427,12 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw( ; GFX6-LABEL: global_system_one_as_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm @@ -5509,20 +5477,20 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5533,8 +5501,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5551,12 +5519,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; GFX6-LABEL: global_system_one_as_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -5611,21 +5579,21 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5639,8 +5607,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5660,12 +5628,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; GFX6-LABEL: global_system_one_as_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -5720,21 +5688,21 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5748,8 +5716,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5769,12 +5737,12 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( ; GFX6-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 @@ -5827,21 +5795,21 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5854,8 +5822,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5875,12 +5843,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; GFX6-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -5939,22 +5907,22 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5969,8 +5937,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5992,12 +5960,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; GFX6-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6056,22 +6024,22 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6086,8 +6054,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6109,13 +6077,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -6159,34 +6127,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6198,13 +6166,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; GFX6-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 @@ -6258,25 +6226,25 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol @@ -6284,12 +6252,12 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -6304,13 +6272,13 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; GFX6-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm @@ -6360,39 +6328,39 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6404,13 +6372,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6470,28 +6438,28 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol @@ -6499,14 +6467,14 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -6521,13 +6489,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6587,28 +6555,28 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol @@ -6616,14 +6584,14 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -6638,13 +6606,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; GFX6-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 @@ -6698,25 +6666,25 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol @@ -6724,12 +6692,12 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -6744,13 +6712,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; GFX6-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 @@ -6804,25 +6772,25 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol @@ -6830,12 +6798,12 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -6850,13 +6818,13 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; GFX6-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6916,28 +6884,28 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol @@ -6945,14 +6913,14 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -6967,13 +6935,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7033,28 +7001,28 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol @@ -7062,14 +7030,14 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -7084,13 +7052,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7150,28 +7118,28 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol @@ -7179,14 +7147,14 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -7201,13 +7169,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7267,28 +7235,28 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol @@ -7296,14 +7264,14 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -7318,13 +7286,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7384,28 +7352,28 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol @@ -7413,14 +7381,14 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -7435,13 +7403,13 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; GFX6-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7501,28 +7469,28 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol @@ -7530,14 +7498,14 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -7552,13 +7520,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7618,28 +7586,28 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol @@ -7647,14 +7615,14 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -7669,13 +7637,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7735,28 +7703,28 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol @@ -7764,14 +7732,14 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -7786,13 +7754,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -7846,40 +7814,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -7893,13 +7861,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 @@ -7959,44 +7927,44 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -8010,13 +7978,13 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8076,45 +8044,45 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -8128,13 +8096,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8200,49 +8168,49 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -8256,13 +8224,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8328,49 +8296,49 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -8384,13 +8352,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 @@ -8450,44 +8418,44 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -8501,13 +8469,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 @@ -8567,44 +8535,44 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -8618,13 +8586,13 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; GFX6-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8690,49 +8658,49 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -8746,13 +8714,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8818,49 +8786,49 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -8874,13 +8842,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8946,49 +8914,49 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -9002,13 +8970,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -9074,49 +9042,49 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -9130,13 +9098,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -9202,49 +9170,49 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -9258,13 +9226,13 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -9330,49 +9298,49 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -9386,13 +9354,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -9458,49 +9426,49 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -9514,13 +9482,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -9586,49 +9554,49 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll @@ -8,18 +8,16 @@ define amdgpu_kernel void @global_volatile_load_0( ; GFX6-LABEL: global_volatile_load_0: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -29,11 +27,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: flat_load_dword v2, v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_volatile_load_0: @@ -58,18 +56,16 @@ ; ; SKIP-CACHE-INV-LABEL: global_volatile_load_0: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { @@ -82,30 +78,30 @@ define amdgpu_kernel void @global_volatile_load_1( ; GFX6-LABEL: global_volatile_load_1: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX6-NEXT: v_mov_b32_e32 v1, 0 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s6 -; GFX6-NEXT: s_mov_b32 s1, s7 -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s7, s3 -; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 +; GFX6-NEXT: s_mov_b32 s2, 0 +; GFX6-NEXT: s_mov_b32 s3, s7 +; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_volatile_load_1: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, s0, v2 -; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GFX7-NEXT: flat_load_dword v2, v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v2, v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 @@ -136,19 +132,19 @@ ; ; SKIP-CACHE-INV-LABEL: global_volatile_load_1: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, 0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 glc +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s7 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -316,17 +312,15 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load( ; GFX6-LABEL: global_volatile_workgroup_acquire_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm @@ -337,11 +331,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_volatile_workgroup_acquire_load: @@ -367,17 +361,15 @@ ; ; SKIP-CACHE-INV-LABEL: global_volatile_workgroup_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -403,12 +395,12 @@ ; ; GFX7-LABEL: global_volatile_workgroup_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll @@ -10,17 +10,15 @@ define amdgpu_kernel void @global_wavefront_unordered_load( ; GFX6-LABEL: global_wavefront_unordered_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm @@ -31,11 +29,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_unordered_load: @@ -60,17 +58,15 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -104,17 +100,15 @@ define amdgpu_kernel void @global_wavefront_monotonic_load( ; GFX6-LABEL: global_wavefront_monotonic_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm @@ -125,11 +119,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_monotonic_load: @@ -154,17 +148,15 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -198,17 +190,15 @@ define amdgpu_kernel void @global_wavefront_acquire_load( ; GFX6-LABEL: global_wavefront_acquire_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm @@ -219,11 +209,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_acquire_load: @@ -248,17 +238,15 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -292,17 +280,15 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load( ; GFX6-LABEL: global_wavefront_seq_cst_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm @@ -313,11 +299,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_seq_cst_load: @@ -342,17 +328,15 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -397,12 +381,12 @@ ; ; GFX7-LABEL: global_wavefront_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -478,12 +462,12 @@ ; ; GFX7-LABEL: global_wavefront_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -559,12 +543,12 @@ ; ; GFX7-LABEL: global_wavefront_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -640,12 +624,12 @@ ; ; GFX7-LABEL: global_wavefront_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -710,12 +694,12 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw( ; GFX6-LABEL: global_wavefront_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -754,19 +738,19 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -775,8 +759,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -791,12 +775,12 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; GFX6-LABEL: global_wavefront_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -835,19 +819,19 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -856,8 +840,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -872,12 +856,12 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; GFX6-LABEL: global_wavefront_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -916,19 +900,19 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -937,8 +921,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -953,12 +937,12 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; GFX6-LABEL: global_wavefront_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -997,19 +981,19 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1018,8 +1002,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1034,12 +1018,12 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; GFX6-LABEL: global_wavefront_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -1078,19 +1062,19 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1099,8 +1083,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1115,12 +1099,12 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw( ; GFX6-LABEL: global_wavefront_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1167,21 +1151,21 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1192,8 +1176,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1211,12 +1195,12 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; GFX6-LABEL: global_wavefront_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1263,21 +1247,21 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1288,8 +1272,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1307,12 +1291,12 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; GFX6-LABEL: global_wavefront_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1359,21 +1343,21 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1384,8 +1368,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1403,13 +1387,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -1453,34 +1437,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -1492,13 +1476,13 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; GFX6-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -1542,34 +1526,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -1581,13 +1565,13 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; GFX6-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -1631,34 +1615,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -1670,13 +1654,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -1720,34 +1704,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -1759,13 +1743,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -1809,34 +1793,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -1848,13 +1832,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; GFX6-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -1898,34 +1882,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -1937,13 +1921,13 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; GFX6-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -1987,34 +1971,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -2026,13 +2010,13 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; GFX6-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -2076,34 +2060,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -2115,13 +2099,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -2165,34 +2149,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -2204,13 +2188,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -2254,34 +2238,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -2293,13 +2277,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -2343,34 +2327,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -2382,13 +2366,13 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -2432,34 +2416,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -2471,13 +2455,13 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; GFX6-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -2521,34 +2505,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -2560,13 +2544,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -2610,34 +2594,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -2649,13 +2633,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -2699,34 +2683,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -2738,13 +2722,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -2798,40 +2782,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -2845,13 +2829,13 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -2905,40 +2889,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -2952,13 +2936,13 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -3012,40 +2996,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3059,13 +3043,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -3119,40 +3103,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3166,13 +3150,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -3226,40 +3210,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3273,13 +3257,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -3333,40 +3317,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3380,13 +3364,13 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -3440,40 +3424,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3487,13 +3471,13 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -3547,40 +3531,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3594,13 +3578,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -3654,40 +3638,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3701,13 +3685,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -3761,40 +3745,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3808,13 +3792,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -3868,40 +3852,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3915,13 +3899,13 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -3975,40 +3959,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4022,13 +4006,13 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -4082,40 +4066,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4129,13 +4113,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -4189,40 +4173,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4236,13 +4220,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -4296,40 +4280,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4343,17 +4327,15 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load( ; GFX6-LABEL: global_wavefront_one_as_unordered_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm @@ -4364,11 +4346,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_one_as_unordered_load: @@ -4393,17 +4375,15 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -4437,17 +4417,15 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load( ; GFX6-LABEL: global_wavefront_one_as_monotonic_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm @@ -4458,11 +4436,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_load: @@ -4487,17 +4465,15 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -4531,17 +4507,15 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load( ; GFX6-LABEL: global_wavefront_one_as_acquire_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm @@ -4552,11 +4526,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_load: @@ -4581,17 +4555,15 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -4625,17 +4597,15 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load( ; GFX6-LABEL: global_wavefront_one_as_seq_cst_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm @@ -4646,11 +4616,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_load: @@ -4675,17 +4645,15 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -4730,12 +4698,12 @@ ; ; GFX7-LABEL: global_wavefront_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -4811,12 +4779,12 @@ ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -4892,12 +4860,12 @@ ; ; GFX7-LABEL: global_wavefront_one_as_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -4973,12 +4941,12 @@ ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -5043,12 +5011,12 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw( ; GFX6-LABEL: global_wavefront_one_as_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -5087,19 +5055,19 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5108,8 +5076,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5124,12 +5092,12 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; GFX6-LABEL: global_wavefront_one_as_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -5168,19 +5136,19 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5189,8 +5157,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5205,12 +5173,12 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; GFX6-LABEL: global_wavefront_one_as_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -5249,19 +5217,19 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5270,8 +5238,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5286,12 +5254,12 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; GFX6-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -5330,19 +5298,19 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5351,8 +5319,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5367,12 +5335,12 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; GFX6-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -5411,19 +5379,19 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5432,8 +5400,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5448,12 +5416,12 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw( ; GFX6-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -5500,21 +5468,21 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5525,8 +5493,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5544,12 +5512,12 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX6-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -5596,21 +5564,21 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5621,8 +5589,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5640,12 +5608,12 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX6-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -5692,21 +5660,21 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5717,8 +5685,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5736,13 +5704,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -5786,34 +5754,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -5825,13 +5793,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -5875,34 +5843,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -5914,13 +5882,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -5964,34 +5932,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6003,13 +5971,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -6053,34 +6021,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6092,13 +6060,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -6142,34 +6110,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6181,13 +6149,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -6231,34 +6199,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6270,13 +6238,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -6320,34 +6288,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6359,13 +6327,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -6409,34 +6377,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6448,13 +6416,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -6498,34 +6466,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6537,13 +6505,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -6587,34 +6555,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6626,13 +6594,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -6676,34 +6644,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6715,13 +6683,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -6765,34 +6733,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6804,13 +6772,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -6854,34 +6822,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6893,13 +6861,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -6943,34 +6911,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6982,13 +6950,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -7032,34 +7000,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -7071,13 +7039,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -7131,40 +7099,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -7178,13 +7146,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -7238,40 +7206,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -7285,13 +7253,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -7345,40 +7313,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -7392,13 +7360,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -7452,40 +7420,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -7499,13 +7467,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -7559,40 +7527,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -7606,13 +7574,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -7666,40 +7634,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -7713,13 +7681,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -7773,40 +7741,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -7820,13 +7788,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -7880,40 +7848,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -7927,13 +7895,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -7987,40 +7955,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -8034,13 +8002,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -8094,40 +8062,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -8141,13 +8109,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -8201,40 +8169,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -8248,13 +8216,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -8308,40 +8276,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -8355,13 +8323,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -8415,40 +8383,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -8462,13 +8430,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -8522,40 +8490,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -8569,13 +8537,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -8629,40 +8597,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll @@ -10,17 +10,15 @@ define amdgpu_kernel void @global_workgroup_unordered_load( ; GFX6-LABEL: global_workgroup_unordered_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm @@ -31,11 +29,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_unordered_load: @@ -60,17 +58,15 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -104,17 +100,15 @@ define amdgpu_kernel void @global_workgroup_monotonic_load( ; GFX6-LABEL: global_workgroup_monotonic_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm @@ -125,11 +119,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_monotonic_load: @@ -154,17 +148,15 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -198,17 +190,15 @@ define amdgpu_kernel void @global_workgroup_acquire_load( ; GFX6-LABEL: global_workgroup_acquire_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm @@ -219,11 +209,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_acquire_load: @@ -249,17 +239,15 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -294,18 +282,16 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; GFX6-LABEL: global_workgroup_seq_cst_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm @@ -317,11 +303,11 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_seq_cst_load: @@ -348,18 +334,16 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -405,12 +389,12 @@ ; ; GFX7-LABEL: global_workgroup_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -486,12 +470,12 @@ ; ; GFX7-LABEL: global_workgroup_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -568,12 +552,12 @@ ; ; GFX7-LABEL: global_workgroup_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -657,12 +641,12 @@ ; ; GFX7-LABEL: global_workgroup_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -734,12 +718,12 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw( ; GFX6-LABEL: global_workgroup_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -778,19 +762,19 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -799,8 +783,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -815,12 +799,12 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; GFX6-LABEL: global_workgroup_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -861,19 +845,19 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -882,8 +866,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -900,12 +884,12 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; GFX6-LABEL: global_workgroup_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm @@ -949,20 +933,20 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -972,8 +956,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -989,12 +973,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX6-LABEL: global_workgroup_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm @@ -1040,20 +1024,20 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1063,8 +1047,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1082,12 +1066,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX6-LABEL: global_workgroup_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm @@ -1133,20 +1117,20 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1156,8 +1140,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1175,12 +1159,12 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw( ; GFX6-LABEL: global_workgroup_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1228,21 +1212,21 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1253,8 +1237,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1273,12 +1257,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; GFX6-LABEL: global_workgroup_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1331,22 +1315,22 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1358,8 +1342,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1379,12 +1363,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; GFX6-LABEL: global_workgroup_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1437,22 +1421,22 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1464,8 +1448,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1485,13 +1469,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -1535,34 +1519,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -1574,13 +1558,13 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; GFX6-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -1626,34 +1610,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -1667,13 +1651,13 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; GFX6-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm @@ -1722,37 +1706,37 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -1764,13 +1748,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm @@ -1821,37 +1805,37 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -1865,13 +1849,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm @@ -1922,37 +1906,37 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -1966,13 +1950,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; GFX6-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -2018,34 +2002,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -2059,13 +2043,13 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; GFX6-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -2111,34 +2095,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -2152,13 +2136,13 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX6-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm @@ -2209,37 +2193,37 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -2253,13 +2237,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm @@ -2310,37 +2294,37 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -2354,13 +2338,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm @@ -2411,37 +2395,37 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -2455,13 +2439,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm @@ -2512,37 +2496,37 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -2556,13 +2540,13 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm @@ -2613,37 +2597,37 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -2657,13 +2641,13 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX6-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm @@ -2714,37 +2698,37 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -2758,13 +2742,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm @@ -2815,37 +2799,37 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -2859,13 +2843,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm @@ -2916,37 +2900,37 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -2960,13 +2944,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -3020,40 +3004,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3067,13 +3051,13 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -3128,41 +3112,41 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3176,13 +3160,13 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -3241,43 +3225,43 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3291,13 +3275,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -3357,44 +3341,44 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3408,13 +3392,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -3474,44 +3458,44 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3525,13 +3509,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -3586,41 +3570,41 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3634,13 +3618,13 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -3695,41 +3679,41 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3743,13 +3727,13 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -3809,44 +3793,44 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3860,13 +3844,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -3926,44 +3910,44 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3977,13 +3961,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -4043,44 +4027,44 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4094,13 +4078,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -4160,44 +4144,44 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4211,13 +4195,13 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -4277,44 +4261,44 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4328,13 +4312,13 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -4394,44 +4378,44 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4445,13 +4429,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -4511,44 +4495,44 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4562,13 +4546,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -4628,44 +4612,44 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4679,17 +4663,15 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load( ; GFX6-LABEL: global_workgroup_one_as_unordered_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm @@ -4700,11 +4682,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_one_as_unordered_load: @@ -4729,17 +4711,15 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -4773,17 +4753,15 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load( ; GFX6-LABEL: global_workgroup_one_as_monotonic_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm @@ -4794,11 +4772,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_load: @@ -4823,17 +4801,15 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -4867,17 +4843,15 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load( ; GFX6-LABEL: global_workgroup_one_as_acquire_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm @@ -4888,11 +4862,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_load: @@ -4918,17 +4892,15 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -4963,17 +4935,15 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; GFX6-LABEL: global_workgroup_one_as_seq_cst_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm @@ -4984,11 +4954,11 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_load: @@ -5015,17 +4985,15 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -5071,12 +5039,12 @@ ; ; GFX7-LABEL: global_workgroup_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -5152,12 +5120,12 @@ ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -5233,12 +5201,12 @@ ; ; GFX7-LABEL: global_workgroup_one_as_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -5317,12 +5285,12 @@ ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -5390,12 +5358,12 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw( ; GFX6-LABEL: global_workgroup_one_as_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -5434,19 +5402,19 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5455,8 +5423,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5471,12 +5439,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; GFX6-LABEL: global_workgroup_one_as_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -5517,19 +5485,19 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5538,8 +5506,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5556,12 +5524,12 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; GFX6-LABEL: global_workgroup_one_as_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -5602,19 +5570,19 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5623,8 +5591,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5640,12 +5608,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; GFX6-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -5688,19 +5656,19 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5709,8 +5677,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5728,12 +5696,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; GFX6-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -5776,19 +5744,19 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5797,8 +5765,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5816,12 +5784,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( ; GFX6-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -5869,21 +5837,21 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5894,8 +5862,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5914,12 +5882,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX6-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -5969,21 +5937,21 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5994,8 +5962,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6015,12 +5983,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX6-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -6070,21 +6038,21 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6095,8 +6063,8 @@ ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6116,13 +6084,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -6166,34 +6134,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6205,13 +6173,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -6257,34 +6225,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -6298,13 +6266,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -6350,35 +6318,35 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6390,13 +6358,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -6444,35 +6412,35 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -6486,13 +6454,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -6540,35 +6508,35 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -6582,13 +6550,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -6634,34 +6602,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -6675,13 +6643,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -6727,34 +6695,34 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -6768,13 +6736,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -6822,35 +6790,35 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -6864,13 +6832,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -6918,35 +6886,35 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -6960,13 +6928,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -7014,35 +6982,35 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -7056,13 +7024,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -7110,35 +7078,35 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -7152,13 +7120,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -7206,35 +7174,35 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -7248,13 +7216,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -7302,35 +7270,35 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -7344,13 +7312,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -7398,35 +7366,35 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -7440,13 +7408,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -7494,35 +7462,35 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -7536,13 +7504,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -7596,40 +7564,40 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -7643,13 +7611,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -7704,41 +7672,41 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -7752,13 +7720,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -7814,41 +7782,41 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -7862,13 +7830,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -7925,42 +7893,42 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -7974,13 +7942,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -8037,42 +8005,42 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -8086,13 +8054,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -8147,41 +8115,41 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -8195,13 +8163,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -8256,41 +8224,41 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -8304,13 +8272,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -8367,42 +8335,42 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -8416,13 +8384,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -8479,42 +8447,42 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -8528,13 +8496,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -8591,42 +8559,42 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -8640,13 +8608,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -8703,42 +8671,42 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -8752,13 +8720,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -8815,42 +8783,42 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -8864,13 +8832,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -8927,42 +8895,42 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -8976,13 +8944,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -9039,42 +9007,42 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -9088,13 +9056,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -9151,42 +9119,42 @@ ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll @@ -374,12 +374,12 @@ define amdgpu_kernel void @local_agent_unordered_store( ; GFX6-LABEL: local_agent_unordered_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -447,12 +447,12 @@ define amdgpu_kernel void @local_agent_monotonic_store( ; GFX6-LABEL: local_agent_monotonic_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -520,12 +520,12 @@ define amdgpu_kernel void @local_agent_release_store( ; GFX6-LABEL: local_agent_release_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -601,12 +601,12 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; GFX6-LABEL: local_agent_seq_cst_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -4739,12 +4739,12 @@ define amdgpu_kernel void @local_agent_one_as_unordered_store( ; GFX6-LABEL: local_agent_one_as_unordered_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -4812,12 +4812,12 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_store( ; GFX6-LABEL: local_agent_one_as_monotonic_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -4885,12 +4885,12 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX6-LABEL: local_agent_one_as_release_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -4958,12 +4958,12 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX6-LABEL: local_agent_one_as_seq_cst_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll @@ -10,14 +10,14 @@ define amdgpu_kernel void @local_nontemporal_load_0( ; GFX6-LABEL: local_nontemporal_load_0: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm @@ -64,14 +64,14 @@ ; ; SKIP-CACHE-INV-LABEL: local_nontemporal_load_0: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -80,24 +80,24 @@ ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_nontemporal_load_0: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(1)* %out) { entry: @@ -109,13 +109,13 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX6-LABEL: local_nontemporal_load_1: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v0 ; GFX6-NEXT: ds_read_b32 v0, v0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -165,13 +165,13 @@ ; ; SKIP-CACHE-INV-LABEL: local_nontemporal_load_1: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s2, v0 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll @@ -362,12 +362,12 @@ define amdgpu_kernel void @local_singlethread_unordered_store( ; GFX6-LABEL: local_singlethread_unordered_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -435,12 +435,12 @@ define amdgpu_kernel void @local_singlethread_monotonic_store( ; GFX6-LABEL: local_singlethread_monotonic_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -508,12 +508,12 @@ define amdgpu_kernel void @local_singlethread_release_store( ; GFX6-LABEL: local_singlethread_release_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -581,12 +581,12 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; GFX6-LABEL: local_singlethread_seq_cst_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -4335,12 +4335,12 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_store( ; GFX6-LABEL: local_singlethread_one_as_unordered_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -4408,12 +4408,12 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_store( ; GFX6-LABEL: local_singlethread_one_as_monotonic_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -4481,12 +4481,12 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; GFX6-LABEL: local_singlethread_one_as_release_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -4554,12 +4554,12 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; GFX6-LABEL: local_singlethread_one_as_seq_cst_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll @@ -374,12 +374,12 @@ define amdgpu_kernel void @local_system_unordered_store( ; GFX6-LABEL: local_system_unordered_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -447,12 +447,12 @@ define amdgpu_kernel void @local_system_monotonic_store( ; GFX6-LABEL: local_system_monotonic_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -520,12 +520,12 @@ define amdgpu_kernel void @local_system_release_store( ; GFX6-LABEL: local_system_release_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -601,12 +601,12 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; GFX6-LABEL: local_system_seq_cst_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -4739,12 +4739,12 @@ define amdgpu_kernel void @local_system_one_as_unordered_store( ; GFX6-LABEL: local_system_one_as_unordered_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -4812,12 +4812,12 @@ define amdgpu_kernel void @local_system_one_as_monotonic_store( ; GFX6-LABEL: local_system_one_as_monotonic_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -4885,12 +4885,12 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; GFX6-LABEL: local_system_one_as_release_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -4958,12 +4958,12 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX6-LABEL: local_system_one_as_seq_cst_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll @@ -8,14 +8,14 @@ define amdgpu_kernel void @local_volatile_load_0( ; GFX6-LABEL: local_volatile_load_0: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm @@ -62,14 +62,14 @@ ; ; SKIP-CACHE-INV-LABEL: local_volatile_load_0: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -83,13 +83,13 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX6-LABEL: local_volatile_load_1: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v0 ; GFX6-NEXT: ds_read_b32 v0, v0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -139,13 +139,13 @@ ; ; SKIP-CACHE-INV-LABEL: local_volatile_load_1: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s2, v0 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -379,12 +379,12 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store( ; GFX6-LABEL: local_volatile_workgroup_release_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_load_dword s2, s[0:1], 0xa +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll @@ -362,12 +362,12 @@ define amdgpu_kernel void @local_wavefront_unordered_store( ; GFX6-LABEL: local_wavefront_unordered_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -435,12 +435,12 @@ define amdgpu_kernel void @local_wavefront_monotonic_store( ; GFX6-LABEL: local_wavefront_monotonic_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -508,12 +508,12 @@ define amdgpu_kernel void @local_wavefront_release_store( ; GFX6-LABEL: local_wavefront_release_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -581,12 +581,12 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; GFX6-LABEL: local_wavefront_seq_cst_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -4335,12 +4335,12 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_store( ; GFX6-LABEL: local_wavefront_one_as_unordered_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -4408,12 +4408,12 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_store( ; GFX6-LABEL: local_wavefront_one_as_monotonic_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -4481,12 +4481,12 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; GFX6-LABEL: local_wavefront_one_as_release_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -4554,12 +4554,12 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; GFX6-LABEL: local_wavefront_one_as_seq_cst_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll @@ -374,12 +374,12 @@ define amdgpu_kernel void @local_workgroup_unordered_store( ; GFX6-LABEL: local_workgroup_unordered_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -447,12 +447,12 @@ define amdgpu_kernel void @local_workgroup_monotonic_store( ; GFX6-LABEL: local_workgroup_monotonic_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -520,12 +520,12 @@ define amdgpu_kernel void @local_workgroup_release_store( ; GFX6-LABEL: local_workgroup_release_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -601,12 +601,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; GFX6-LABEL: local_workgroup_seq_cst_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -4739,12 +4739,12 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_store( ; GFX6-LABEL: local_workgroup_one_as_unordered_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -4812,12 +4812,12 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_store( ; GFX6-LABEL: local_workgroup_one_as_monotonic_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -4885,12 +4885,12 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; GFX6-LABEL: local_workgroup_one_as_release_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -4958,12 +4958,12 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; GFX6-LABEL: local_workgroup_one_as_seq_cst_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll @@ -12,13 +12,13 @@ ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3] ; GFX6-NEXT: s_mov_b64 s[8:9], s[0:1] -; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; GFX6-NEXT: s_add_u32 s8, s8, s7 ; GFX6-NEXT: s_addc_u32 s9, s9, 0 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc slc ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -78,18 +78,18 @@ ; ; SKIP-CACHE-INV-LABEL: private_nontemporal_load_0: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[8:9] -; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[4:5] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: s_add_u32 s8, s8, s3 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s9, s9, 0 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc slc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s4, s3 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s5, 0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc slc ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -102,12 +102,12 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s8, s8, s7 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s9, s9, 0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen glc slc +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc slc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_0: @@ -118,12 +118,12 @@ ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_add_u32 s8, s8, s7 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s9, s9, 0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen glc slc +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc slc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(5)* %in, i32 addrspace(1)* %out) { entry: @@ -137,13 +137,13 @@ ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3] ; GFX6-NEXT: s_mov_b64 s[8:9], s[0:1] -; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; GFX6-NEXT: s_add_u32 s8, s8, s7 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX6-NEXT: s_addc_u32 s9, s9, 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v0 ; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc slc ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -205,19 +205,19 @@ ; ; SKIP-CACHE-INV-LABEL: private_nontemporal_load_1: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[8:9] -; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[4:5] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s4, v0 -; SKIP-CACHE-INV-NEXT: s_add_u32 s8, s8, s3 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s9, s9, 0 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc slc +; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s2, v0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s4, s3 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s5, 0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc slc ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -394,9 +394,9 @@ ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x2 ; GFX6-NEXT: s_add_u32 s8, s8, s7 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX6-NEXT: s_addc_u32 s9, s9, 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX6-NEXT: s_addc_u32 s9, s9, 0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s0 @@ -411,9 +411,9 @@ ; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 ; GFX7-NEXT: s_add_u32 s8, s8, s7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX7-NEXT: s_addc_u32 s9, s9, 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX7-NEXT: s_addc_u32 s9, s9, 0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll @@ -8,17 +8,17 @@ define amdgpu_kernel void @private_volatile_load_0( ; GFX6-LABEL: private_volatile_load_0: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX6-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; GFX6-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s10, -1 -; GFX6-NEXT: s_mov_b32 s11, 0xe8f000 -; GFX6-NEXT: s_add_u32 s8, s8, s3 -; GFX6-NEXT: s_addc_u32 s9, s9, 0 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xe8f000 +; GFX6-NEXT: s_add_u32 s4, s4, s3 +; GFX6-NEXT: s_addc_u32 s5, s5, 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -78,19 +78,19 @@ ; ; SKIP-CACHE-INV-LABEL: private_volatile_load_0: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[8:9] -; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[4:5] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: s_add_u32 s8, s8, s3 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s9, s9, 0 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s4, s3 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s5, 0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(5)* %in, i32 addrspace(1)* %out) { @@ -103,18 +103,18 @@ define amdgpu_kernel void @private_volatile_load_1( ; GFX6-LABEL: private_volatile_load_1: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX6-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; GFX6-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s10, -1 -; GFX6-NEXT: s_mov_b32 s11, 0xe8f000 -; GFX6-NEXT: s_add_u32 s8, s8, s3 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xe8f000 +; GFX6-NEXT: s_add_u32 s4, s4, s3 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX6-NEXT: s_addc_u32 s9, s9, 0 +; GFX6-NEXT: s_addc_u32 s5, s5, 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v0 -; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v0 +; GFX6-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -175,20 +175,20 @@ ; ; SKIP-CACHE-INV-LABEL: private_volatile_load_1: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[8:9] -; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[4:5] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s4, v0 -; SKIP-CACHE-INV-NEXT: s_add_u32 s8, s8, s3 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s9, s9, 0 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc +; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s2, v0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s4, s3 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s5, 0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(5)* %in, i32 addrspace(1)* %out) { @@ -327,9 +327,9 @@ ; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 ; GFX7-NEXT: s_add_u32 s8, s8, s7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX7-NEXT: s_addc_u32 s9, s9, 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX7-NEXT: s_addc_u32 s9, s9, 0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s0 diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll --- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll +++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll @@ -71,14 +71,15 @@ define amdgpu_kernel void @scalar_clause(<4 x i32> addrspace(1)* noalias nocapture readonly %arg, <4 x i32> addrspace(1)* noalias nocapture %arg1) { ; GCN-LABEL: scalar_clause: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 -; GCN-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x2c ; GCN-NEXT: v_mov_b32_e32 v12, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx4 s[0:3], s[16:17], 0x0 -; GCN-NEXT: s_load_dwordx4 s[4:7], s[16:17], 0x10 -; GCN-NEXT: s_load_dwordx4 s[8:11], s[16:17], 0x20 -; GCN-NEXT: s_load_dwordx4 s[12:15], s[16:17], 0x30 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[12:13], 0x0 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[12:13], 0x10 +; GCN-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x20 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x30 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 @@ -92,14 +93,14 @@ ; GCN-NEXT: v_mov_b32_e32 v9, s9 ; GCN-NEXT: v_mov_b32_e32 v10, s10 ; GCN-NEXT: v_mov_b32_e32 v11, s11 -; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[18:19] -; GCN-NEXT: global_store_dwordx4 v12, v[4:7], s[18:19] offset:16 -; GCN-NEXT: global_store_dwordx4 v12, v[8:11], s[18:19] offset:32 +; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GCN-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 +; GCN-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:32 ; GCN-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NEXT: v_mov_b32_e32 v1, s13 ; GCN-NEXT: v_mov_b32_e32 v2, s14 ; GCN-NEXT: v_mov_b32_e32 v3, s15 -; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[18:19] offset:48 +; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:48 ; GCN-NEXT: s_endpgm ; ; GCN-SCRATCH-LABEL: scalar_clause: @@ -161,44 +162,56 @@ ; GCN-NEXT: v_and_b32_e32 v2, 0x3ff, v2 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 4, v2 ; GCN-NEXT: v_add_u32_e32 v0, v0, v2 -; GCN-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4 -; GCN-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:8 -; GCN-NEXT: buffer_load_dword v6, v0, s[0:3], 0 offen offset:12 -; GCN-NEXT: buffer_load_dword v7, v0, s[0:3], 0 offen offset:16 -; GCN-NEXT: buffer_load_dword v8, v0, s[0:3], 0 offen offset:20 -; GCN-NEXT: buffer_load_dword v9, v0, s[0:3], 0 offen offset:24 -; GCN-NEXT: buffer_load_dword v10, v0, s[0:3], 0 offen offset:28 -; GCN-NEXT: buffer_load_dword v11, v0, s[0:3], 0 offen offset:32 -; GCN-NEXT: buffer_load_dword v12, v0, s[0:3], 0 offen offset:36 -; GCN-NEXT: buffer_load_dword v13, v0, s[0:3], 0 offen offset:40 -; GCN-NEXT: buffer_load_dword v14, v0, s[0:3], 0 offen offset:44 -; GCN-NEXT: buffer_load_dword v15, v0, s[0:3], 0 offen offset:48 -; GCN-NEXT: buffer_load_dword v16, v0, s[0:3], 0 offen offset:52 -; GCN-NEXT: buffer_load_dword v17, v0, s[0:3], 0 offen offset:56 +; GCN-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:12 +; GCN-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:8 +; GCN-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4 +; GCN-NEXT: buffer_load_dword v6, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v7, v0, s[0:3], 0 offen offset:28 +; GCN-NEXT: buffer_load_dword v8, v0, s[0:3], 0 offen offset:24 +; GCN-NEXT: buffer_load_dword v9, v0, s[0:3], 0 offen offset:20 +; GCN-NEXT: buffer_load_dword v10, v0, s[0:3], 0 offen offset:16 +; GCN-NEXT: buffer_load_dword v11, v0, s[0:3], 0 offen offset:44 +; GCN-NEXT: buffer_load_dword v12, v0, s[0:3], 0 offen offset:40 +; GCN-NEXT: buffer_load_dword v13, v0, s[0:3], 0 offen offset:36 +; GCN-NEXT: buffer_load_dword v14, v0, s[0:3], 0 offen offset:32 +; GCN-NEXT: buffer_load_dword v15, v0, s[0:3], 0 offen offset:60 +; GCN-NEXT: buffer_load_dword v16, v0, s[0:3], 0 offen offset:56 +; GCN-NEXT: buffer_load_dword v17, v0, s[0:3], 0 offen offset:52 ; GCN-NEXT: s_nop 0 -; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:60 +; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:48 ; GCN-NEXT: v_add_u32_e32 v1, v1, v2 -; GCN-NEXT: s_waitcnt vmcnt(12) -; GCN-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen offset:12 -; GCN-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen offset:8 -; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen offset:4 -; GCN-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(12) -; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen offset:28 -; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen offset:24 -; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen offset:20 -; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen offset:16 -; GCN-NEXT: s_waitcnt vmcnt(12) -; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen offset:44 -; GCN-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen offset:40 -; GCN-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen offset:36 -; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen offset:32 -; GCN-NEXT: s_waitcnt vmcnt(12) -; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:60 -; GCN-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen offset:56 -; GCN-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen offset:52 -; GCN-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen offset:48 +; GCN-NEXT: s_waitcnt vmcnt(15) +; GCN-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen offset:12 +; GCN-NEXT: s_waitcnt vmcnt(15) +; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen offset:8 +; GCN-NEXT: s_waitcnt vmcnt(15) +; GCN-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen offset:4 +; GCN-NEXT: s_waitcnt vmcnt(15) +; GCN-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(15) +; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen offset:28 +; GCN-NEXT: s_waitcnt vmcnt(15) +; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen offset:24 +; GCN-NEXT: s_waitcnt vmcnt(15) +; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen offset:20 +; GCN-NEXT: s_waitcnt vmcnt(15) +; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen offset:16 +; GCN-NEXT: s_waitcnt vmcnt(15) +; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen offset:44 +; GCN-NEXT: s_waitcnt vmcnt(15) +; GCN-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen offset:40 +; GCN-NEXT: s_waitcnt vmcnt(15) +; GCN-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen offset:36 +; GCN-NEXT: s_waitcnt vmcnt(15) +; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen offset:32 +; GCN-NEXT: s_waitcnt vmcnt(15) +; GCN-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen offset:60 +; GCN-NEXT: s_waitcnt vmcnt(15) +; GCN-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen offset:56 +; GCN-NEXT: s_waitcnt vmcnt(15) +; GCN-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen offset:52 +; GCN-NEXT: s_waitcnt vmcnt(15) +; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:48 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -85,10 +85,10 @@ ; SI: s_min_i32 ; SI: s_min_i32 -; VI: s_min_i32 -; VI: s_min_i32 -; VI: s_min_i32 -; VI: v_min_i32_sdwa +; VI-DAG: s_min_i32 +; VI-DAG: s_min_i32 +; VI-DAG: s_min_i32 +; VI-DAG: v_min_i32_sdwa ; GFX9_10: v_min_i16 ; GFX9_10: v_min_i16 diff --git a/llvm/test/CodeGen/AMDGPU/missing-store.ll b/llvm/test/CodeGen/AMDGPU/missing-store.ll --- a/llvm/test/CodeGen/AMDGPU/missing-store.ll +++ b/llvm/test/CodeGen/AMDGPU/missing-store.ll @@ -6,15 +6,14 @@ ; resulting in losing the store to gptr ; FUNC-LABEL: {{^}}missing_store_reduced: -; SI: s_load_dwordx4 -; SI: ds_read_b64 -; SI-DAG: buffer_store_dword +; SI: s_load_dwordx4 +; SI-DAG: ds_read_b64 ; SI-DAG: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}} -; SI: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}} -; SI: s_nop 3 -; SI: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}} -; SI: buffer_store_dword -; SI: s_endpgm +; SI-DAG: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}} +; SI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}} +; SI-DAG: buffer_store_dword +; SI-DAG: buffer_store_dword +; SI: s_endpgm define amdgpu_kernel void @missing_store_reduced(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { %ptr0 = load i32 addrspace(4)*, i32 addrspace(4)* addrspace(3)* @ptr_load, align 8 %ptr2 = getelementptr inbounds i32, i32 addrspace(4)* %ptr0, i64 2 diff --git a/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll b/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll --- a/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll +++ b/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll @@ -6,12 +6,11 @@ ; FIXME: We should be able to use the SGPR directly as src0 to v_add_i32 ; GCN-LABEL: {{^}}clobber_vgpr_pair_pointer_add: -; GCN: s_load_dwordx2 s{{\[}}[[ARG1LO:[0-9]+]]:[[ARG1HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}} +; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[LDPTRLO:[0-9]+]]:[[LDPTRHI:[0-9]+]]{{\]}} +; GCN-DAG: s_load_dwordx2 s{{\[}}[[ARG1LO:[0-9]+]]:[[ARG1HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}} -; GCN-NOT: v_mov_b32 -; GCN: buffer_load_dwordx2 v{{\[}}[[LDPTRLO:[0-9]+]]:[[LDPTRHI:[0-9]+]]{{\]}} -; GCN: v_mov_b32_e32 v[[VARG1LO:[0-9]+]], s[[ARG1LO]] -; GCN: v_mov_b32_e32 v[[VARG1HI:[0-9]+]], s[[ARG1HI]] +; GCN-DAG: v_mov_b32_e32 v[[VARG1LO:[0-9]+]], s[[ARG1LO]] +; GCN-DAG: v_mov_b32_e32 v[[VARG1HI:[0-9]+]], s[[ARG1HI]] ; GCN-NOT: v_mov_b32 ; GCN-NOT: v_mov_b32 diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll --- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll @@ -23,22 +23,22 @@ ; ; VI-LABEL: test_smul24_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bfe_i32 s0, s0, 0x180000 -; VI-NEXT: s_bfe_i32 s1, s1, 0x180000 -; VI-NEXT: s_mul_i32 s0, s0, s1 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_bfe_i32 s4, s4, 0x180000 +; VI-NEXT: s_bfe_i32 s5, s5, 0x180000 +; VI-NEXT: s_mul_i32 s4, s4, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_smul24_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -109,20 +109,20 @@ ; ; VI-LABEL: test_smulhi24_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s0, v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s4, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_smulhi24_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -321,25 +321,25 @@ ; ; VI-LABEL: test_smul24_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s2, s[0:1], 0x4c -; VI-NEXT: s_load_dword s0, s[0:1], 0x70 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x4c +; VI-NEXT: s_load_dword s5, s[0:1], 0x70 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bfe_i32 s1, s2, 0x180000 -; VI-NEXT: s_bfe_i32 s0, s0, 0x180000 -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_mul_hi_i32_i24_e32 v1, s0, v0 -; VI-NEXT: v_mul_i32_i24_e32 v0, s0, v0 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: s_bfe_i32 s4, s4, 0x180000 +; VI-NEXT: s_bfe_i32 s5, s5, 0x180000 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mul_hi_i32_i24_e32 v1, s5, v0 +; VI-NEXT: v_mul_i32_i24_e32 v0, s5, v0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_smul24_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x4c ; GFX9-NEXT: s_load_dword s3, s[0:1], 0x70 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -420,21 +420,21 @@ ; ; VI-LABEL: test_smul24_i64_square: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bfe_i32 s0, s0, 0x180000 -; VI-NEXT: v_mul_hi_i32_i24_e64 v1, s0, s0 -; VI-NEXT: v_mul_i32_i24_e64 v0, s0, s0 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: s_bfe_i32 s4, s4, 0x180000 +; VI-NEXT: v_mul_hi_i32_i24_e64 v1, s4, s4 +; VI-NEXT: v_mul_i32_i24_e64 v0, s4, s4 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_smul24_i64_square: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -514,28 +514,28 @@ ; ; VI-LABEL: test_smul24_i33: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s1, s2, 8 -; VI-NEXT: s_lshl_b32 s3, s0, 8 +; VI-NEXT: s_lshl_b32 s3, s2, 8 +; VI-NEXT: s_lshl_b32 s5, s4, 8 +; VI-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 ; VI-NEXT: s_ashr_i64 s[2:3], s[2:3], 40 -; VI-NEXT: s_ashr_i64 s[0:1], s[0:1], 40 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mul_hi_i32_i24_e32 v1, s0, v0 -; VI-NEXT: v_mul_i32_i24_e32 v0, s0, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mul_hi_i32_i24_e32 v1, s2, v0 +; VI-NEXT: v_mul_i32_i24_e32 v0, s2, v0 ; VI-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: v_ashrrev_i64 v[0:1], 31, v[0:1] -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_smul24_i33: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s3, s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 @@ -624,22 +624,22 @@ ; ; VI-LABEL: test_smulhi24_i33: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dword s5, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s2, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s5, v0 ; VI-NEXT: v_and_b32_e32 v0, 1, v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_smulhi24_i33: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s3, s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 @@ -732,18 +732,18 @@ ; VI-NEXT: ; %bb.1: ; %bb7 ; VI-NEXT: s_endpgm ; VI-NEXT: .LBB8_2: ; %bb11 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s2, s[0:1], 0x34 -; VI-NEXT: s_load_dword s0, s[0:1], 0x3c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dword s5, s[0:1], 0x3c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bfe_i32 s1, s2, 0x180000 -; VI-NEXT: s_bfe_i32 s0, s0, 0x180000 -; VI-NEXT: s_mul_i32 s1, s1, s0 -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: s_bfe_i32 s4, s4, 0x180000 +; VI-NEXT: s_bfe_i32 s5, s5, 0x180000 +; VI-NEXT: s_mul_i32 s4, s4, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: simplify_i24_crash: @@ -755,9 +755,9 @@ ; GFX9-NEXT: ; %bb.1: ; %bb7 ; GFX9-NEXT: s_endpgm ; GFX9-NEXT: .LBB8_2: ; %bb11 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: s_load_dword s3, s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll --- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll @@ -24,23 +24,23 @@ ; ; VI-LABEL: test_umul24_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s2, 0xffffff -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s6, 0xffffff +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s0, s0, s2 -; VI-NEXT: s_and_b32 s1, s1, s2 -; VI-NEXT: s_mul_i32 s0, s0, s1 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_and_b32 s4, s4, s6 +; VI-NEXT: s_and_b32 s5, s5, s6 +; VI-NEXT: s_mul_i32 s4, s4, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_umul24_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s0, 0xffffff ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 @@ -78,22 +78,22 @@ ; ; VI-LABEL: test_umul24_i16_sext: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s1, s0, 16 -; VI-NEXT: s_mul_i32 s0, s0, s1 -; VI-NEXT: s_sext_i32_i16 s0, s0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_lshr_b32 s5, s4, 16 +; VI-NEXT: s_mul_i32 s4, s4, s5 +; VI-NEXT: s_sext_i32_i16 s4, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_umul24_i16_sext: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -136,22 +136,20 @@ ; ; VI-LABEL: test_umul24_i16_vgpr_sext: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v0 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v1 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ushort v2, v[2:3] ; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mul_lo_u16_e32 v0, v2, v0 ; VI-NEXT: v_bfe_i32 v0, v0, 0, 16 @@ -160,16 +158,14 @@ ; ; GFX9-LABEL: test_umul24_i16_vgpr_sext: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] +; GFX9-NEXT: global_load_ushort v3, v1, s[2:3] ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v3, v1, s[6:7] -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u16_e32 v0, v2, v3 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16 @@ -204,22 +200,22 @@ ; ; VI-LABEL: test_umul24_i16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s1, s0, 16 -; VI-NEXT: s_mul_i32 s0, s0, s1 -; VI-NEXT: s_and_b32 s0, s0, 0xffff -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_lshr_b32 s5, s4, 16 +; VI-NEXT: s_mul_i32 s4, s4, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_umul24_i16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -262,22 +258,20 @@ ; ; VI-LABEL: test_umul24_i16_vgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v0 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v1 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ushort v2, v[2:3] ; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mul_lo_u16_e32 v0, v2, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -285,16 +279,14 @@ ; ; GFX9-LABEL: test_umul24_i16_vgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] +; GFX9-NEXT: global_load_ushort v3, v1, s[2:3] ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v3, v1, s[6:7] -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u16_e32 v0, v2, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -339,41 +331,37 @@ ; VI-LABEL: test_umul24_i8_vgpr: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v4, s9 -; VI-NEXT: v_add_u32_e32 v0, vcc, s8, v1 +; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc ; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v0, v[0:1] -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mul_lo_u16_e32 v0, v2, v0 ; VI-NEXT: v_bfe_i32 v0, v0, 0, 8 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_umul24_i8_vgpr: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v2, v0, s[6:7] -; GFX9-NEXT: global_load_ubyte v3, v1, s[8:9] -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u16_e32 v0, v2, v3 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm entry: %tid.x = call i32 @llvm.amdgcn.workitem.id.x() @@ -403,20 +391,20 @@ ; ; VI-LABEL: test_umulhi24_i32_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_mul_hi_u32_u24_e32 v0, s0, v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_mul_hi_u32_u24_e32 v0, s4, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_umulhi24_i32_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s0, 0xffffff ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 @@ -475,7 +463,6 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s7, s[0:1], 0x34 -; GFX9-NEXT: ; kill: killed $sgpr0_sgpr1 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_mov_b32 s0, s4 @@ -541,7 +528,6 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s7, s[0:1], 0x34 -; GFX9-NEXT: ; kill: killed $sgpr0_sgpr1 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_mov_b32 s0, s4 @@ -614,20 +600,20 @@ ; ; VI-LABEL: test_umul24_i64_square: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x4c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mul_hi_u32_u24_e64 v1, s0, s0 -; VI-NEXT: v_mul_u32_u24_e64 v0, s0, s0 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: v_mul_hi_u32_u24_e64 v1, s4, s4 +; VI-NEXT: v_mul_u32_u24_e64 v0, s4, s4 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_umul24_i64_square: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x4c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -665,32 +651,32 @@ ; ; VI-LABEL: test_umulhi16_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s2, 0xffff -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s6, 0xffff +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s0, s0, s2 -; VI-NEXT: s_and_b32 s1, s1, s2 -; VI-NEXT: s_mul_i32 s0, s0, s1 -; VI-NEXT: s_lshr_b32 s0, s0, 16 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: s_and_b32 s4, s4, s6 +; VI-NEXT: s_and_b32 s5, s5, s6 +; VI-NEXT: s_mul_i32 s4, s4, s5 +; VI-NEXT: s_lshr_b32 s4, s4, 16 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_umulhi16_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s4, s0 -; GFX9-NEXT: s_and_b32 s0, s5, s0 +; GFX9-NEXT: s_and_b32 s1, s2, s0 +; GFX9-NEXT: s_and_b32 s0, s3, s0 ; GFX9-NEXT: s_mul_i32 s0, s1, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[2:3] +; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm entry: %a.16 = and i32 %a, 65535 @@ -724,17 +710,17 @@ ; ; VI-LABEL: test_umul24_i33: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dword s5, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: v_mul_u32_u24_e32 v0, s2, v1 -; VI-NEXT: v_mul_hi_u32_u24_e32 v1, s2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mul_u32_u24_e32 v0, s5, v1 +; VI-NEXT: v_mul_hi_u32_u24_e32 v1, s5, v1 ; VI-NEXT: v_and_b32_e32 v1, 1, v1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_umul24_i33: @@ -783,16 +769,16 @@ ; ; VI-LABEL: test_umulhi24_i33: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dword s5, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mul_hi_u32_u24_e32 v0, s2, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mul_hi_u32_u24_e32 v0, s5, v0 ; VI-NEXT: v_and_b32_e32 v0, 1, v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_umulhi24_i33: diff --git a/llvm/test/CodeGen/AMDGPU/operand-spacing.ll b/llvm/test/CodeGen/AMDGPU/operand-spacing.ll --- a/llvm/test/CodeGen/AMDGPU/operand-spacing.ll +++ b/llvm/test/CodeGen/AMDGPU/operand-spacing.ll @@ -9,8 +9,8 @@ ; SI: v_mov_b32_e32 [[VREGA:v[0-9]+]], [[SREGA]] ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SREGB]], [[VREGA]] -; VI: s_load_dword [[SREGA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c -; VI: s_load_dword [[SREGB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x70 +; VI-DAG: s_load_dword [[SREGA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c +; VI-DAG: s_load_dword [[SREGB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x70 ; VI: v_mov_b32_e32 [[VREGB:v[0-9]+]], [[SREGB]] ; VI: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SREGA]], [[VREGB]] diff --git a/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll b/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll --- a/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll @@ -625,7 +625,7 @@ ; GCN-NOT: _or ; GCN: v_pk_add_f16 [[FADD:v[0-9]+]] -; GCN-NEXT: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[FADD]] op_sel:[0,0,1] op_sel_hi:[1,1,0]{{$}} +; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[FADD]] op_sel:[0,0,1] op_sel_hi:[1,1,0]{{$}} define amdgpu_kernel void @bitcast_lo_elt_op_sel(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { bb: %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/pr51516.mir b/llvm/test/CodeGen/AMDGPU/pr51516.mir --- a/llvm/test/CodeGen/AMDGPU/pr51516.mir +++ b/llvm/test/CodeGen/AMDGPU/pr51516.mir @@ -5,7 +5,7 @@ # GCN-LABEL: name: global_sextload_v32i32_to_v32i64 # GCN: renamable $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) -# GCN: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr27, killed renamable $vgpr19_vgpr20_vgpr21_vgpr22, killed renamable $sgpr0_sgpr1, 16, 0, implicit $exec, implicit killed renamable $vgpr0 +# GCN: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr20, killed renamable $vgpr24_vgpr25_vgpr26_vgpr27, killed renamable $sgpr0_sgpr1, 16, 0, implicit $exec, implicit killed renamable $vgpr0 --- name: global_sextload_v32i32_to_v32i64 diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -1,39 +1,304 @@ -; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX900 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX90A %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX90A %s declare i64 @_Z13get_global_idj(i32) define amdgpu_kernel void @clmem_read_simplified(i8 addrspace(1)* %buffer) { -; GCN-LABEL: clmem_read_simplified: -; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8-LABEL: clmem_read_simplified: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s38, -1 +; GFX8-NEXT: s_mov_b32 s39, 0xe80000 +; GFX8-NEXT: s_add_u32 s36, s36, s11 +; GFX8-NEXT: s_addc_u32 s37, s37, 0 +; GFX8-NEXT: s_getpc_b64 s[0:1] +; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: s_mov_b32 s32, 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff8000, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, s35 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, s34, v0 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], 3, v[1:2] +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: s_movk_i32 s0, 0x800 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: s_movk_i32 s0, 0x1000 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc +; GFX8-NEXT: s_movk_i32 s0, 0x1800 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[11:12], v[0:1] +; GFX8-NEXT: flat_load_dwordx2 v[5:6], v[5:6] +; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[7:8] +; GFX8-NEXT: flat_load_dwordx2 v[9:10], v[9:10] +; GFX8-NEXT: s_movk_i32 s0, 0x2000 +; GFX8-NEXT: v_add_u32_e32 v13, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v14, vcc, 0, v1, vcc +; GFX8-NEXT: s_movk_i32 s0, 0x2800 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[13:14], v[13:14] +; GFX8-NEXT: flat_load_dwordx2 v[15:16], v[15:16] +; GFX8-NEXT: s_movk_i32 s0, 0x3000 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[17:18], v[17:18] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x3800, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(6) +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v11 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v6, v12, vcc +; GFX8-NEXT: s_waitcnt vmcnt(5) +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v7, v2 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v8, v5, vcc +; GFX8-NEXT: s_waitcnt vmcnt(4) +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v9, v2 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v10, v5, vcc +; GFX8-NEXT: s_waitcnt vmcnt(3) +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v13, v2 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v14, v5, vcc +; GFX8-NEXT: s_waitcnt vmcnt(2) +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v15, v2 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v16, v5, vcc +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v17, v2 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v18, v5, vcc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc +; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[0:1] +; GFX8-NEXT: s_endpgm ; -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 +; GFX900-LABEL: clmem_read_simplified: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX900-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX900-NEXT: s_mov_b32 s38, -1 +; GFX900-NEXT: s_mov_b32 s39, 0xe00000 +; GFX900-NEXT: s_add_u32 s36, s36, s11 +; GFX900-NEXT: s_addc_u32 s37, s37, 0 +; GFX900-NEXT: s_getpc_b64 s[0:1] +; GFX900-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX900-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX900-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX900-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX900-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX900-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: s_mov_b32 s32, 0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX900-NEXT: v_and_b32_e32 v1, 0xff, v0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff8000, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, s35 +; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, s34, v0 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], 3, v[1:2] +; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 +; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX900-NEXT: s_movk_i32 s1, 0x2000 +; GFX900-NEXT: global_load_dwordx2 v[5:6], v[0:1], off +; GFX900-NEXT: global_load_dwordx2 v[7:8], v[0:1], off offset:2048 +; GFX900-NEXT: v_add_co_u32_e32 v9, vcc, s1, v0 +; GFX900-NEXT: v_addc_co_u32_e32 v10, vcc, 0, v1, vcc +; GFX900-NEXT: global_load_dwordx2 v[11:12], v[9:10], off offset:-4096 +; GFX900-NEXT: s_movk_i32 s0, 0x1000 +; GFX900-NEXT: v_add_co_u32_e32 v13, vcc, s0, v0 +; GFX900-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v1, vcc +; GFX900-NEXT: global_load_dwordx2 v[15:16], v[13:14], off offset:2048 +; GFX900-NEXT: global_load_dwordx2 v[17:18], v[9:10], off +; GFX900-NEXT: global_load_dwordx2 v[19:20], v[9:10], off offset:2048 +; GFX900-NEXT: s_movk_i32 s0, 0x3000 +; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX900-NEXT: global_load_dwordx2 v[9:10], v[0:1], off +; GFX900-NEXT: global_load_dwordx2 v[13:14], v[0:1], off offset:2048 +; GFX900-NEXT: s_waitcnt vmcnt(6) +; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v7, v5 +; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v8, v6, vcc +; GFX900-NEXT: s_waitcnt vmcnt(5) +; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v11, v0 +; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v12, v1, vcc +; GFX900-NEXT: s_waitcnt vmcnt(4) +; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v15, v0 +; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v16, v1, vcc +; GFX900-NEXT: s_waitcnt vmcnt(3) +; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v17, v0 +; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v18, v1, vcc +; GFX900-NEXT: s_waitcnt vmcnt(2) +; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v19, v0 +; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v20, v1, vcc +; GFX900-NEXT: s_waitcnt vmcnt(1) +; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v9, v0 +; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v10, v1, vcc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v13, v0 +; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v14, v1, vcc +; GFX900-NEXT: global_store_dwordx2 v[3:4], v[0:1], off +; GFX900-NEXT: s_endpgm ; -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} - +; GFX10-LABEL: clmem_read_simplified: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX10-NEXT: s_mov_b32 s38, -1 +; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX10-NEXT: s_add_u32 s36, s36, s11 +; GFX10-NEXT: s_addc_u32 s37, s37, 0 +; GFX10-NEXT: s_getpc_b64 s[0:1] +; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX10-NEXT: s_mov_b32 s32, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 7, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff8000, v2 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX10-NEXT: v_add_co_u32 v2, s0, s34, v2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s35, 0, s0 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, 0x1000 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v0, 0x2000 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX10-NEXT: global_load_dwordx2 v[8:9], v[4:5], off offset:-2048 +; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx2 v[12:13], v[4:5], off +; GFX10-NEXT: global_load_dwordx2 v[14:15], v[10:11], off offset:-2048 +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, 0x3000 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx2 v[16:17], v[10:11], off +; GFX10-NEXT: global_load_dwordx2 v[18:19], v[4:5], off offset:-2048 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x3800, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx2 v[10:11], v[4:5], off +; GFX10-NEXT: global_load_dwordx2 v[20:21], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(6) +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v8, v6 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v9, v7, vcc_lo +; GFX10-NEXT: s_waitcnt vmcnt(5) +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v12, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v13, v1, vcc_lo +; GFX10-NEXT: s_waitcnt vmcnt(4) +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v14, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v15, v1, vcc_lo +; GFX10-NEXT: s_waitcnt vmcnt(3) +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v16, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v17, v1, vcc_lo +; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v18, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v19, v1, vcc_lo +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v10, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v11, v1, vcc_lo +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v20, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v21, v1, vcc_lo +; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: s_endpgm +; +; GFX90A-LABEL: clmem_read_simplified: +; GFX90A: ; %bb.0: ; %entry +; GFX90A-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX90A-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX90A-NEXT: s_mov_b32 s38, -1 +; GFX90A-NEXT: s_mov_b32 s39, 0xe00000 +; GFX90A-NEXT: s_add_u32 s36, s36, s11 +; GFX90A-NEXT: s_addc_u32 s37, s37, 0 +; GFX90A-NEXT: s_getpc_b64 s[0:1] +; GFX90A-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX90A-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX90A-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX90A-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: s_mov_b32 s32, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX90A-NEXT: v_and_b32_e32 v2, 0xff, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff8000, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s35 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s34, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_lshlrev_b64 v[2:3], 3, v[2:3] +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc +; GFX90A-NEXT: s_movk_i32 s1, 0x2000 +; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX90A-NEXT: global_load_dwordx2 v[6:7], v[2:3], off offset:2048 +; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, s1, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v3, vcc +; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[8:9], off offset:-4096 +; GFX90A-NEXT: s_movk_i32 s0, 0x1000 +; GFX90A-NEXT: v_add_co_u32_e32 v12, vcc, s0, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v3, vcc +; GFX90A-NEXT: global_load_dwordx2 v[14:15], v[12:13], off offset:2048 +; GFX90A-NEXT: global_load_dwordx2 v[16:17], v[8:9], off +; GFX90A-NEXT: global_load_dwordx2 v[18:19], v[8:9], off offset:2048 +; GFX90A-NEXT: s_movk_i32 s0, 0x3000 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX90A-NEXT: global_load_dwordx2 v[8:9], v[2:3], off +; GFX90A-NEXT: global_load_dwordx2 v[12:13], v[2:3], off offset:2048 +; GFX90A-NEXT: s_waitcnt vmcnt(6) +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v6, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v5, vcc +; GFX90A-NEXT: s_waitcnt vmcnt(5) +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v11, v3, vcc +; GFX90A-NEXT: s_waitcnt vmcnt(4) +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v14, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v15, v3, vcc +; GFX90A-NEXT: s_waitcnt vmcnt(3) +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v16, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v17, v3, vcc +; GFX90A-NEXT: s_waitcnt vmcnt(2) +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v18, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v19, v3, vcc +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v9, v3, vcc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v12, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v13, v3, vcc +; GFX90A-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX90A-NEXT: s_endpgm entry: %call = tail call i64 @_Z13get_global_idj(i32 0) %conv = and i64 %call, 255 @@ -74,55 +339,476 @@ } define hidden amdgpu_kernel void @clmem_read(i8 addrspace(1)* %buffer) { -; GCN-LABEL: clmem_read: -; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8-LABEL: clmem_read: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s38, -1 +; GFX8-NEXT: s_mov_b32 s39, 0xe80000 +; GFX8-NEXT: s_add_u32 s36, s36, s11 +; GFX8-NEXT: s_addc_u32 s37, s37, 0 +; GFX8-NEXT: s_getpc_b64 s[0:1] +; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: s_mov_b32 s32, 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 17, v0 +; GFX8-NEXT: v_and_b32_e32 v4, 0xfe000000, v0 +; GFX8-NEXT: v_lshlrev_b64 v[2:3], 3, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s34, v4 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s34, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc +; GFX8-NEXT: s_movk_i32 s0, 0x5000 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, 0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: v_mov_b32_e32 v7, 0 +; GFX8-NEXT: s_movk_i32 s0, 0x7f +; GFX8-NEXT: .LBB1_1: ; %for.cond.preheader +; GFX8-NEXT: ; =>This Loop Header: Depth=1 +; GFX8-NEXT: ; Child Loop BB1_2 Depth 2 +; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: s_mov_b32 s1, 0 +; GFX8-NEXT: .LBB1_2: ; %for.body +; GFX8-NEXT: ; Parent Loop BB1_1 Depth=1 +; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0xffffb000, v4 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, -1, v5, vcc +; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[8:9] +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0xffffb800, v4 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, -1, v5, vcc +; GFX8-NEXT: flat_load_dwordx2 v[10:11], v[10:11] +; GFX8-NEXT: v_add_u32_e32 v12, vcc, 0xffffc000, v4 +; GFX8-NEXT: v_addc_u32_e32 v13, vcc, -1, v5, vcc +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xffffc800, v4 +; GFX8-NEXT: v_addc_u32_e32 v15, vcc, -1, v5, vcc +; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xffffd000, v4 +; GFX8-NEXT: flat_load_dwordx2 v[12:13], v[12:13] +; GFX8-NEXT: flat_load_dwordx2 v[14:15], v[14:15] +; GFX8-NEXT: v_addc_u32_e32 v17, vcc, -1, v5, vcc +; GFX8-NEXT: v_add_u32_e32 v18, vcc, 0xffffd800, v4 +; GFX8-NEXT: v_addc_u32_e32 v19, vcc, -1, v5, vcc +; GFX8-NEXT: v_add_u32_e32 v20, vcc, 0xffffe000, v4 +; GFX8-NEXT: flat_load_dwordx2 v[16:17], v[16:17] +; GFX8-NEXT: flat_load_dwordx2 v[18:19], v[18:19] +; GFX8-NEXT: v_addc_u32_e32 v21, vcc, -1, v5, vcc +; GFX8-NEXT: v_add_u32_e32 v22, vcc, 0xffffe800, v4 +; GFX8-NEXT: v_addc_u32_e32 v23, vcc, -1, v5, vcc +; GFX8-NEXT: v_add_u32_e32 v24, vcc, 0xfffff000, v4 +; GFX8-NEXT: flat_load_dwordx2 v[20:21], v[20:21] +; GFX8-NEXT: flat_load_dwordx2 v[22:23], v[22:23] +; GFX8-NEXT: v_addc_u32_e32 v25, vcc, -1, v5, vcc +; GFX8-NEXT: s_addk_i32 s1, 0x2000 +; GFX8-NEXT: s_cmp_gt_u32 s1, 0x3fffff +; GFX8-NEXT: s_waitcnt vmcnt(7) +; GFX8-NEXT: v_add_u32_e32 v26, vcc, v8, v6 +; GFX8-NEXT: v_addc_u32_e32 v27, vcc, v9, v7, vcc +; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[24:25] +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xfffff800, v4 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v5, vcc +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[6:7] +; GFX8-NEXT: flat_load_dwordx2 v[24:25], v[4:5] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x10000, v4 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; GFX8-NEXT: s_waitcnt vmcnt(9) +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v26 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v11, v27, vcc +; GFX8-NEXT: s_waitcnt vmcnt(8) +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v12, v10 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v13, v11, vcc +; GFX8-NEXT: s_waitcnt vmcnt(7) +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v14, v10 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v15, v11, vcc +; GFX8-NEXT: s_waitcnt vmcnt(6) +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v16, v10 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v17, v11, vcc +; GFX8-NEXT: s_waitcnt vmcnt(5) +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v18, v10 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v19, v11, vcc +; GFX8-NEXT: s_waitcnt vmcnt(4) +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v20, v10 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v21, v11, vcc +; GFX8-NEXT: s_waitcnt vmcnt(3) +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v22, v10 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v23, v11, vcc +; GFX8-NEXT: s_waitcnt vmcnt(2) +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v10 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v9, v11, vcc +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v24, v6 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v25, v7, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB1_2 +; GFX8-NEXT: ; %bb.3: ; %while.cond.loopexit +; GFX8-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; GFX8-NEXT: s_add_i32 s1, s0, -1 +; GFX8-NEXT: s_cmp_eq_u32 s0, 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB1_5 +; GFX8-NEXT: ; %bb.4: ; in Loop: Header=BB1_1 Depth=1 +; GFX8-NEXT: s_mov_b32 s0, s1 +; GFX8-NEXT: s_branch .LBB1_1 +; GFX8-NEXT: .LBB1_5: ; %while.end +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[6:7] +; GFX8-NEXT: s_endpgm ; -; GFX900: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 -; GFX900: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 -; GFX900: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX900: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 -; GFX900: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX900: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 -; GFX900: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX900: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 -; GFX900: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 -; GFX900: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 -; GFX900: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} +; GFX900-LABEL: clmem_read: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX900-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX900-NEXT: s_mov_b32 s38, -1 +; GFX900-NEXT: s_mov_b32 s39, 0xe00000 +; GFX900-NEXT: s_add_u32 s36, s36, s11 +; GFX900-NEXT: s_addc_u32 s37, s37, 0 +; GFX900-NEXT: s_getpc_b64 s[0:1] +; GFX900-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX900-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX900-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX900-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX900-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX900-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: s_mov_b32 s32, 0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX900-NEXT: v_and_b32_e32 v1, 0xff, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 17, v0 +; GFX900-NEXT: v_and_b32_e32 v4, 0xfe000000, v0 +; GFX900-NEXT: v_lshlrev_b64 v[2:3], 3, v[1:2] +; GFX900-NEXT: v_mov_b32_e32 v5, s35 +; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, s34, v4 +; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc +; GFX900-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, s34, v2 +; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; GFX900-NEXT: s_movk_i32 s0, 0x5000 +; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX900-NEXT: s_movk_i32 s4, 0x7f +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: s_movk_i32 s2, 0xd000 +; GFX900-NEXT: s_movk_i32 s3, 0xe000 +; GFX900-NEXT: s_movk_i32 s5, 0xf000 +; GFX900-NEXT: .LBB1_1: ; %for.cond.preheader +; GFX900-NEXT: ; =>This Loop Header: Depth=1 +; GFX900-NEXT: ; Child Loop BB1_2 Depth 2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: s_mov_b32 s6, 0 +; GFX900-NEXT: .LBB1_2: ; %for.body +; GFX900-NEXT: ; Parent Loop BB1_1 Depth=1 +; GFX900-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, 0xffffb000, v4 +; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, -1, v5, vcc +; GFX900-NEXT: global_load_dwordx2 v[10:11], v[4:5], off offset:-4096 +; GFX900-NEXT: global_load_dwordx2 v[12:13], v[4:5], off offset:-2048 +; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v4 +; GFX900-NEXT: global_load_dwordx2 v[8:9], v[8:9], off +; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v5, vcc +; GFX900-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048 +; GFX900-NEXT: v_add_co_u32_e32 v16, vcc, s2, v4 +; GFX900-NEXT: v_addc_co_u32_e32 v17, vcc, -1, v5, vcc +; GFX900-NEXT: global_load_dwordx2 v[22:23], v[14:15], off +; GFX900-NEXT: global_load_dwordx2 v[24:25], v[16:17], off offset:-2048 +; GFX900-NEXT: v_add_co_u32_e32 v20, vcc, s3, v4 +; GFX900-NEXT: v_addc_co_u32_e32 v21, vcc, -1, v5, vcc +; GFX900-NEXT: global_load_dwordx2 v[16:17], v[20:21], off offset:-4096 +; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, s5, v4 +; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v5, vcc +; GFX900-NEXT: s_addk_i32 s6, 0x2000 +; GFX900-NEXT: s_cmp_gt_u32 s6, 0x3fffff +; GFX900-NEXT: s_waitcnt vmcnt(4) +; GFX900-NEXT: v_add_co_u32_e64 v28, s[0:1], v8, v6 +; GFX900-NEXT: v_addc_co_u32_e64 v29, s[0:1], v9, v7, s[0:1] +; GFX900-NEXT: global_load_dwordx2 v[6:7], v[20:21], off offset:-2048 +; GFX900-NEXT: global_load_dwordx2 v[8:9], v[20:21], off +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: global_load_dwordx2 v[20:21], v[14:15], off offset:-2048 +; GFX900-NEXT: global_load_dwordx2 v[26:27], v[4:5], off +; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, 0x10000, v4 +; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX900-NEXT: s_waitcnt vmcnt(7) +; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, v18, v28 +; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, v19, v29, vcc +; GFX900-NEXT: s_waitcnt vmcnt(6) +; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, v22, v14 +; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, v23, v15, vcc +; GFX900-NEXT: s_waitcnt vmcnt(5) +; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, v24, v14 +; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, v25, v15, vcc +; GFX900-NEXT: s_waitcnt vmcnt(4) +; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, v16, v14 +; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, v17, v15, vcc +; GFX900-NEXT: s_waitcnt vmcnt(3) +; GFX900-NEXT: v_add_co_u32_e32 v6, vcc, v6, v14 +; GFX900-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v15, vcc +; GFX900-NEXT: s_waitcnt vmcnt(2) +; GFX900-NEXT: v_add_co_u32_e32 v6, vcc, v8, v6 +; GFX900-NEXT: v_addc_co_u32_e32 v7, vcc, v9, v7, vcc +; GFX900-NEXT: s_waitcnt vmcnt(1) +; GFX900-NEXT: v_add_co_u32_e32 v6, vcc, v20, v6 +; GFX900-NEXT: v_addc_co_u32_e32 v7, vcc, v21, v7, vcc +; GFX900-NEXT: v_add_co_u32_e32 v6, vcc, v10, v6 +; GFX900-NEXT: v_addc_co_u32_e32 v7, vcc, v11, v7, vcc +; GFX900-NEXT: v_add_co_u32_e32 v6, vcc, v12, v6 +; GFX900-NEXT: v_addc_co_u32_e32 v7, vcc, v13, v7, vcc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_add_co_u32_e32 v6, vcc, v26, v6 +; GFX900-NEXT: v_addc_co_u32_e32 v7, vcc, v27, v7, vcc +; GFX900-NEXT: s_cbranch_scc0 .LBB1_2 +; GFX900-NEXT: ; %bb.3: ; %while.cond.loopexit +; GFX900-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; GFX900-NEXT: s_add_i32 s0, s4, -1 +; GFX900-NEXT: s_cmp_eq_u32 s4, 0 +; GFX900-NEXT: s_cbranch_scc1 .LBB1_5 +; GFX900-NEXT: ; %bb.4: ; in Loop: Header=BB1_1 Depth=1 +; GFX900-NEXT: s_mov_b32 s4, s0 +; GFX900-NEXT: s_branch .LBB1_1 +; GFX900-NEXT: .LBB1_5: ; %while.end +; GFX900-NEXT: global_store_dwordx2 v[0:1], v[6:7], off +; GFX900-NEXT: s_endpgm ; -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} - -; GFX90A: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 -; GFX90A: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 -; GFX90A: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX90A: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 -; GFX90A: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX90A: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 -; GFX90A: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 -; GFX90A: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX90A: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 -; GFX90A: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 -; GFX90A: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} - +; GFX10-LABEL: clmem_read: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX10-NEXT: s_mov_b32 s38, -1 +; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX10-NEXT: s_add_u32 s36, s36, s11 +; GFX10-NEXT: s_addc_u32 s37, s37, 0 +; GFX10-NEXT: s_getpc_b64 s[0:1] +; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX10-NEXT: s_mov_b32 s32, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 17, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-NEXT: s_movk_i32 s1, 0x7f +; GFX10-NEXT: v_lshlrev_b64 v[0:1], 3, v[1:2] +; GFX10-NEXT: v_and_b32_e32 v2, 0xfe000000, v3 +; GFX10-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v0, s34 +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, s35, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v0, s0, s34, v2 +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0x5000, v3 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s35, 0, s0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v6, vcc_lo +; GFX10-NEXT: .LBB1_1: ; %for.cond.preheader +; GFX10-NEXT: ; =>This Loop Header: Depth=1 +; GFX10-NEXT: ; Child Loop BB1_2 Depth 2 +; GFX10-NEXT: v_mov_b32_e32 v7, v3 +; GFX10-NEXT: v_mov_b32_e32 v6, v2 +; GFX10-NEXT: s_mov_b32 s2, 0 +; GFX10-NEXT: .LBB1_2: ; %for.body +; GFX10-NEXT: ; Parent Loop BB1_1 Depth=1 +; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v6, 0xffffb800 +; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, -1, v7, vcc_lo +; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v6, 0xffffc800 +; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, -1, v7, vcc_lo +; GFX10-NEXT: v_add_co_u32 v14, vcc_lo, v6, 0xffffd800 +; GFX10-NEXT: v_add_co_ci_u32_e32 v15, vcc_lo, -1, v7, vcc_lo +; GFX10-NEXT: v_add_co_u32 v18, vcc_lo, v6, 0xffffe800 +; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: global_load_dwordx2 v[12:13], v[8:9], off offset:-2048 +; GFX10-NEXT: global_load_dwordx2 v[16:17], v[10:11], off offset:-2048 +; GFX10-NEXT: global_load_dwordx2 v[20:21], v[14:15], off offset:-2048 +; GFX10-NEXT: v_add_co_ci_u32_e32 v19, vcc_lo, -1, v7, vcc_lo +; GFX10-NEXT: v_add_co_u32 v22, vcc_lo, 0xfffff000, v6 +; GFX10-NEXT: v_add_co_ci_u32_e32 v23, vcc_lo, -1, v7, vcc_lo +; GFX10-NEXT: s_clause 0x7 +; GFX10-NEXT: global_load_dwordx2 v[24:25], v[18:19], off offset:-2048 +; GFX10-NEXT: global_load_dwordx2 v[8:9], v[8:9], off +; GFX10-NEXT: global_load_dwordx2 v[10:11], v[10:11], off +; GFX10-NEXT: global_load_dwordx2 v[14:15], v[14:15], off +; GFX10-NEXT: global_load_dwordx2 v[26:27], v[18:19], off +; GFX10-NEXT: global_load_dwordx2 v[28:29], v[22:23], off +; GFX10-NEXT: global_load_dwordx2 v[30:31], v[6:7], off offset:-2048 +; GFX10-NEXT: global_load_dwordx2 v[32:33], v[6:7], off +; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, 0x10000, v6 +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo +; GFX10-NEXT: s_addk_i32 s2, 0x2000 +; GFX10-NEXT: s_cmp_gt_u32 s2, 0x3fffff +; GFX10-NEXT: s_waitcnt vmcnt(10) +; GFX10-NEXT: v_add_co_u32 v4, s0, v12, v4 +; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s0, v13, v5, s0 +; GFX10-NEXT: s_waitcnt vmcnt(6) +; GFX10-NEXT: v_add_co_u32 v4, s0, v8, v4 +; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s0, v9, v5, s0 +; GFX10-NEXT: v_add_co_u32 v4, s0, v16, v4 +; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s0, v17, v5, s0 +; GFX10-NEXT: s_waitcnt vmcnt(5) +; GFX10-NEXT: v_add_co_u32 v4, s0, v10, v4 +; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s0, v11, v5, s0 +; GFX10-NEXT: v_add_co_u32 v4, s0, v20, v4 +; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s0, v21, v5, s0 +; GFX10-NEXT: s_waitcnt vmcnt(4) +; GFX10-NEXT: v_add_co_u32 v4, s0, v14, v4 +; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s0, v15, v5, s0 +; GFX10-NEXT: v_add_co_u32 v4, s0, v24, v4 +; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s0, v25, v5, s0 +; GFX10-NEXT: s_waitcnt vmcnt(3) +; GFX10-NEXT: v_add_co_u32 v4, s0, v26, v4 +; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s0, v27, v5, s0 +; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: v_add_co_u32 v4, s0, v28, v4 +; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s0, v29, v5, s0 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_add_co_u32 v4, s0, v30, v4 +; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s0, v31, v5, s0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v32, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v33, v5, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB1_2 +; GFX10-NEXT: ; %bb.3: ; %while.cond.loopexit +; GFX10-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; GFX10-NEXT: s_add_i32 s0, s1, -1 +; GFX10-NEXT: s_cmp_eq_u32 s1, 0 +; GFX10-NEXT: s_cbranch_scc1 .LBB1_5 +; GFX10-NEXT: ; %bb.4: ; in Loop: Header=BB1_1 Depth=1 +; GFX10-NEXT: s_mov_b32 s1, s0 +; GFX10-NEXT: s_branch .LBB1_1 +; GFX10-NEXT: .LBB1_5: ; %while.end +; GFX10-NEXT: global_store_dwordx2 v[0:1], v[4:5], off +; GFX10-NEXT: s_endpgm +; +; GFX90A-LABEL: clmem_read: +; GFX90A: ; %bb.0: ; %entry +; GFX90A-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX90A-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX90A-NEXT: s_mov_b32 s38, -1 +; GFX90A-NEXT: s_mov_b32 s39, 0xe00000 +; GFX90A-NEXT: s_add_u32 s36, s36, s11 +; GFX90A-NEXT: s_addc_u32 s37, s37, 0 +; GFX90A-NEXT: s_getpc_b64 s[0:1] +; GFX90A-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX90A-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX90A-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX90A-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: s_mov_b32 s32, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX90A-NEXT: v_and_b32_e32 v2, 0xff, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 17, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_and_b32_e32 v4, 0xfe000000, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, s35 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s34, v4 +; GFX90A-NEXT: v_lshlrev_b64 v[2:3], 3, v[2:3] +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc +; GFX90A-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, s34, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; GFX90A-NEXT: s_movk_i32 s0, 0x5000 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX90A-NEXT: s_movk_i32 s2, 0x7f +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], 0, 0 +; GFX90A-NEXT: s_movk_i32 s0, 0xd000 +; GFX90A-NEXT: s_movk_i32 s1, 0xe000 +; GFX90A-NEXT: s_movk_i32 s3, 0xf000 +; GFX90A-NEXT: .LBB1_1: ; %for.cond.preheader +; GFX90A-NEXT: ; =>This Loop Header: Depth=1 +; GFX90A-NEXT: ; Child Loop BB1_2 Depth 2 +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_mov_b32 s4, 0 +; GFX90A-NEXT: .LBB1_2: ; %for.body +; GFX90A-NEXT: ; Parent Loop BB1_1 Depth=1 +; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, 0xffffb000, v6 +; GFX90A-NEXT: v_addc_co_u32_e32 v9, vcc, -1, v7, vcc +; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[6:7], off offset:-4096 +; GFX90A-NEXT: global_load_dwordx2 v[12:13], v[6:7], off offset:-2048 +; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v6 +; GFX90A-NEXT: global_load_dwordx2 v[8:9], v[8:9], off +; GFX90A-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v7, vcc +; GFX90A-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048 +; GFX90A-NEXT: v_add_co_u32_e32 v16, vcc, s0, v6 +; GFX90A-NEXT: v_addc_co_u32_e32 v17, vcc, -1, v7, vcc +; GFX90A-NEXT: global_load_dwordx2 v[16:17], v[16:17], off offset:-2048 +; GFX90A-NEXT: v_add_co_u32_e32 v20, vcc, s1, v6 +; GFX90A-NEXT: global_load_dwordx2 v[14:15], v[14:15], off +; GFX90A-NEXT: v_addc_co_u32_e32 v21, vcc, -1, v7, vcc +; GFX90A-NEXT: global_load_dwordx2 v[24:25], v[20:21], off offset:-4096 +; GFX90A-NEXT: global_load_dwordx2 v[26:27], v[20:21], off offset:-2048 +; GFX90A-NEXT: global_load_dwordx2 v[28:29], v[20:21], off +; GFX90A-NEXT: v_add_co_u32_e32 v22, vcc, s3, v6 +; GFX90A-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v7, vcc +; GFX90A-NEXT: global_load_dwordx2 v[20:21], v[22:23], off offset:-2048 +; GFX90A-NEXT: global_load_dwordx2 v[30:31], v[6:7], off +; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x10000, v6 +; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX90A-NEXT: s_addk_i32 s4, 0x2000 +; GFX90A-NEXT: s_cmp_gt_u32 s4, 0x3fffff +; GFX90A-NEXT: s_waitcnt vmcnt(8) +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v5, vcc +; GFX90A-NEXT: s_waitcnt vmcnt(7) +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v18, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v19, v5, vcc +; GFX90A-NEXT: s_waitcnt vmcnt(5) +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v14, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v15, v5, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v16, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v17, v5, vcc +; GFX90A-NEXT: s_waitcnt vmcnt(4) +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v24, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v25, v5, vcc +; GFX90A-NEXT: s_waitcnt vmcnt(3) +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v26, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v27, v5, vcc +; GFX90A-NEXT: s_waitcnt vmcnt(2) +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v28, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v29, v5, vcc +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v20, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v21, v5, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v10, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v11, v5, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v13, v5, vcc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v30, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v31, v5, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB1_2 +; GFX90A-NEXT: ; %bb.3: ; %while.cond.loopexit +; GFX90A-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; GFX90A-NEXT: s_add_i32 s4, s2, -1 +; GFX90A-NEXT: s_cmp_eq_u32 s2, 0 +; GFX90A-NEXT: s_cbranch_scc1 .LBB1_5 +; GFX90A-NEXT: ; %bb.4: ; in Loop: Header=BB1_1 Depth=1 +; GFX90A-NEXT: s_mov_b32 s2, s4 +; GFX90A-NEXT: s_branch .LBB1_1 +; GFX90A-NEXT: .LBB1_5: ; %while.end +; GFX90A-NEXT: global_store_dwordx2 v[0:1], v[4:5], off +; GFX90A-NEXT: s_endpgm entry: %call = tail call i64 @_Z13get_global_idj(i32 0) %conv = and i64 %call, 255 @@ -222,39 +908,271 @@ ; using 32bit address. define amdgpu_kernel void @Address32(i8 addrspace(1)* %buffer) { -; GCN-LABEL: Address32: -; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; GFX8-LABEL: Address32: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s38, -1 +; GFX8-NEXT: s_mov_b32 s39, 0xe80000 +; GFX8-NEXT: s_add_u32 s36, s36, s11 +; GFX8-NEXT: s_addc_u32 s37, s37, 0 +; GFX8-NEXT: s_getpc_b64 s[0:1] +; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: s_mov_b32 s32, 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff8000, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, s35 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, s34, v0 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[1:2] +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: s_movk_i32 s0, 0x400 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: s_movk_i32 s0, 0x800 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc +; GFX8-NEXT: s_movk_i32 s0, 0xc00 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v1, vcc +; GFX8-NEXT: s_movk_i32 s0, 0x1000 +; GFX8-NEXT: v_add_u32_e32 v11, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v1, vcc +; GFX8-NEXT: s_movk_i32 s0, 0x1400 +; GFX8-NEXT: v_add_u32_e32 v13, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v14, vcc, 0, v1, vcc +; GFX8-NEXT: s_movk_i32 s0, 0x1800 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v1, vcc +; GFX8-NEXT: s_movk_i32 s0, 0x1c00 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v1, vcc +; GFX8-NEXT: s_movk_i32 s0, 0x2000 +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: flat_load_dword v19, v[5:6] +; GFX8-NEXT: flat_load_dword v7, v[7:8] +; GFX8-NEXT: flat_load_dword v8, v[9:10] +; GFX8-NEXT: flat_load_dword v9, v[11:12] +; GFX8-NEXT: flat_load_dword v10, v[13:14] +; GFX8-NEXT: flat_load_dword v11, v[15:16] +; GFX8-NEXT: flat_load_dword v12, v[17:18] +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x2400, v0 +; GFX8-NEXT: flat_load_dword v5, v[5:6] +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(8) +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v19, v2 +; GFX8-NEXT: s_waitcnt vmcnt(7) +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v7, v1 +; GFX8-NEXT: s_waitcnt vmcnt(6) +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v8, v1 +; GFX8-NEXT: s_waitcnt vmcnt(5) +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v9, v1 +; GFX8-NEXT: s_waitcnt vmcnt(4) +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v10, v1 +; GFX8-NEXT: s_waitcnt vmcnt(3) +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v11, v1 +; GFX8-NEXT: s_waitcnt vmcnt(2) +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v12, v1 +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v5, v1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: flat_store_dword v[3:4], v0 +; GFX8-NEXT: s_endpgm +; +; GFX900-LABEL: Address32: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX900-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX900-NEXT: s_mov_b32 s38, -1 +; GFX900-NEXT: s_mov_b32 s39, 0xe00000 +; GFX900-NEXT: s_add_u32 s36, s36, s11 +; GFX900-NEXT: s_addc_u32 s37, s37, 0 +; GFX900-NEXT: s_getpc_b64 s[0:1] +; GFX900-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX900-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX900-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX900-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX900-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX900-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: s_mov_b32 s32, 0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX900-NEXT: v_and_b32_e32 v1, 0xff, v0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff8000, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, s35 +; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, s34, v0 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], 2, v[1:2] +; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 +; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX900-NEXT: s_movk_i32 s0, 0x1000 +; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, s0, v0 +; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc +; GFX900-NEXT: global_load_dword v2, v[0:1], off +; GFX900-NEXT: global_load_dword v7, v[0:1], off offset:1024 +; GFX900-NEXT: global_load_dword v8, v[0:1], off offset:2048 +; GFX900-NEXT: global_load_dword v9, v[0:1], off offset:3072 +; GFX900-NEXT: global_load_dword v10, v[5:6], off +; GFX900-NEXT: global_load_dword v11, v[5:6], off offset:1024 +; GFX900-NEXT: global_load_dword v12, v[5:6], off offset:2048 +; GFX900-NEXT: global_load_dword v13, v[5:6], off offset:3072 +; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 +; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX900-NEXT: global_load_dword v5, v[0:1], off +; GFX900-NEXT: global_load_dword v6, v[0:1], off offset:1024 +; GFX900-NEXT: s_waitcnt vmcnt(8) +; GFX900-NEXT: v_add_u32_e32 v0, v7, v2 +; GFX900-NEXT: s_waitcnt vmcnt(6) +; GFX900-NEXT: v_add3_u32 v0, v8, v0, v9 +; GFX900-NEXT: s_waitcnt vmcnt(4) +; GFX900-NEXT: v_add3_u32 v0, v10, v0, v11 +; GFX900-NEXT: s_waitcnt vmcnt(2) +; GFX900-NEXT: v_add3_u32 v0, v12, v0, v13 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_add3_u32 v0, v5, v0, v6 +; GFX900-NEXT: global_store_dword v[3:4], v0, off +; GFX900-NEXT: s_endpgm ; -; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024 -; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048 -; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072 -; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024 -; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048 -; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072 -; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024 +; GFX10-LABEL: Address32: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX10-NEXT: s_mov_b32 s38, -1 +; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX10-NEXT: s_add_u32 s36, s36, s11 +; GFX10-NEXT: s_addc_u32 s37, s37, 0 +; GFX10-NEXT: s_getpc_b64 s[0:1] +; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX10-NEXT: s_mov_b32 s32, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 7, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff8000, v2 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX10-NEXT: v_add_co_u32 v2, s0, s34, v2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s35, 0, s0 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x800, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v0, 0x1000 +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, 0x1000, v0 +; GFX10-NEXT: s_clause 0x4 +; GFX10-NEXT: global_load_dword v10, v[0:1], off +; GFX10-NEXT: global_load_dword v11, v[0:1], off offset:1024 +; GFX10-NEXT: global_load_dword v12, v[4:5], off offset:1024 +; GFX10-NEXT: global_load_dword v13, v[6:7], off offset:-2048 +; GFX10-NEXT: global_load_dword v14, v[6:7], off +; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x1800, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v0, 0x2000 +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dword v15, v[8:9], off offset:1024 +; GFX10-NEXT: global_load_dword v16, v[4:5], off offset:1024 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: global_load_dword v4, v[6:7], off offset:-2048 +; GFX10-NEXT: global_load_dword v5, v[6:7], off +; GFX10-NEXT: global_load_dword v8, v[0:1], off offset:1024 +; GFX10-NEXT: s_waitcnt vmcnt(8) +; GFX10-NEXT: v_add_nc_u32_e32 v0, v11, v10 +; GFX10-NEXT: s_waitcnt vmcnt(6) +; GFX10-NEXT: v_add3_u32 v0, v13, v0, v12 +; GFX10-NEXT: s_waitcnt vmcnt(4) +; GFX10-NEXT: v_add3_u32 v0, v14, v0, v15 +; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: v_add3_u32 v0, v4, v0, v16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add3_u32 v0, v5, v0, v8 +; GFX10-NEXT: global_store_dword v[2:3], v0, off +; GFX10-NEXT: s_endpgm ; -; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024 -; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024 -; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048 -; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024 -; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024 -; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048 -; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024 +; GFX90A-LABEL: Address32: +; GFX90A: ; %bb.0: ; %entry +; GFX90A-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX90A-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX90A-NEXT: s_mov_b32 s38, -1 +; GFX90A-NEXT: s_mov_b32 s39, 0xe00000 +; GFX90A-NEXT: s_add_u32 s36, s36, s11 +; GFX90A-NEXT: s_addc_u32 s37, s37, 0 +; GFX90A-NEXT: s_getpc_b64 s[0:1] +; GFX90A-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX90A-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX90A-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX90A-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: s_mov_b32 s32, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX90A-NEXT: v_and_b32_e32 v2, 0xff, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff8000, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s35 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s34, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc +; GFX90A-NEXT: s_movk_i32 s0, 0x1000 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s0, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v3, vcc +; GFX90A-NEXT: global_load_dword v6, v[2:3], off +; GFX90A-NEXT: global_load_dword v7, v[2:3], off offset:1024 +; GFX90A-NEXT: global_load_dword v8, v[2:3], off offset:2048 +; GFX90A-NEXT: global_load_dword v9, v[2:3], off offset:3072 +; GFX90A-NEXT: global_load_dword v10, v[4:5], off +; GFX90A-NEXT: global_load_dword v11, v[4:5], off offset:1024 +; GFX90A-NEXT: global_load_dword v12, v[4:5], off offset:2048 +; GFX90A-NEXT: global_load_dword v13, v[4:5], off offset:3072 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x2000, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX90A-NEXT: global_load_dword v4, v[2:3], off +; GFX90A-NEXT: global_load_dword v5, v[2:3], off offset:1024 +; GFX90A-NEXT: s_waitcnt vmcnt(8) +; GFX90A-NEXT: v_add_u32_e32 v2, v7, v6 +; GFX90A-NEXT: s_waitcnt vmcnt(6) +; GFX90A-NEXT: v_add3_u32 v2, v8, v2, v9 +; GFX90A-NEXT: s_waitcnt vmcnt(4) +; GFX90A-NEXT: v_add3_u32 v2, v10, v2, v11 +; GFX90A-NEXT: s_waitcnt vmcnt(2) +; GFX90A-NEXT: v_add3_u32 v2, v12, v2, v13 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add3_u32 v2, v4, v2, v5 +; GFX90A-NEXT: global_store_dword v[0:1], v2, off +; GFX90A-NEXT: s_endpgm entry: %call = tail call i64 @_Z13get_global_idj(i32 0) %conv = and i64 %call, 255 @@ -307,21 +1225,206 @@ } define amdgpu_kernel void @Offset64(i8 addrspace(1)* %buffer) { -; GCN-LABEL: Offset64: -; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8-LABEL: Offset64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s38, -1 +; GFX8-NEXT: s_mov_b32 s39, 0xe80000 +; GFX8-NEXT: s_add_u32 s36, s36, s11 +; GFX8-NEXT: s_addc_u32 s37, s37, 0 +; GFX8-NEXT: s_getpc_b64 s[0:1] +; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: s_mov_b32 s32, 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff8000, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, s35 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, s34, v0 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], 3, v[1:2] +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: s_movk_i32 s0, 0xf000 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: s_movk_i32 s0, 0xf800 +; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[0:1] +; GFX8-NEXT: flat_load_dwordx2 v[5:6], v[5:6] +; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[9:10], v[9:10] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 1, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(2) +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v7 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v6, v8, vcc +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v9, v2 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v10, v5, vcc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc +; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[0:1] +; GFX8-NEXT: s_endpgm ; -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 +; GFX900-LABEL: Offset64: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX900-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX900-NEXT: s_mov_b32 s38, -1 +; GFX900-NEXT: s_mov_b32 s39, 0xe00000 +; GFX900-NEXT: s_add_u32 s36, s36, s11 +; GFX900-NEXT: s_addc_u32 s37, s37, 0 +; GFX900-NEXT: s_getpc_b64 s[0:1] +; GFX900-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX900-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX900-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX900-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX900-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX900-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: s_mov_b32 s32, 0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX900-NEXT: v_and_b32_e32 v1, 0xff, v0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff8000, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, s35 +; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, s34, v0 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], 3, v[1:2] +; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 +; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, 0, v0 +; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, 1, v1, vcc +; GFX900-NEXT: global_load_dwordx2 v[5:6], v[0:1], off +; GFX900-NEXT: global_load_dwordx2 v[9:10], v[7:8], off offset:-4096 +; GFX900-NEXT: s_movk_i32 s0, 0xf000 +; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX900-NEXT: global_load_dwordx2 v[11:12], v[7:8], off +; GFX900-NEXT: global_load_dwordx2 v[13:14], v[0:1], off offset:2048 +; GFX900-NEXT: s_waitcnt vmcnt(2) +; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v9, v5 +; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v10, v6, vcc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v13, v0 +; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v14, v1, vcc +; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v11, v0 +; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v12, v1, vcc +; GFX900-NEXT: global_store_dwordx2 v[3:4], v[0:1], off +; GFX900-NEXT: s_endpgm ; -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} +; GFX10-LABEL: Offset64: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX10-NEXT: s_mov_b32 s38, -1 +; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX10-NEXT: s_add_u32 s36, s36, s11 +; GFX10-NEXT: s_addc_u32 s37, s37, 0 +; GFX10-NEXT: s_getpc_b64 s[0:1] +; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX10-NEXT: s_mov_b32 s32, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 7, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff8000, v2 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX10-NEXT: v_add_co_u32 v2, s0, s34, v2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s35, 0, s0 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, 0xfffff800 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX10-NEXT: global_load_dwordx2 v[8:9], v[4:5], off offset:-2048 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 1, v1, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx2 v[10:11], v[4:5], off +; GFX10-NEXT: global_load_dwordx2 v[12:13], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v8, v6 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v9, v7, vcc_lo +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v10, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v11, v1, vcc_lo +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v12, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v13, v1, vcc_lo +; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: s_endpgm +; +; GFX90A-LABEL: Offset64: +; GFX90A: ; %bb.0: ; %entry +; GFX90A-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX90A-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX90A-NEXT: s_mov_b32 s38, -1 +; GFX90A-NEXT: s_mov_b32 s39, 0xe00000 +; GFX90A-NEXT: s_add_u32 s36, s36, s11 +; GFX90A-NEXT: s_addc_u32 s37, s37, 0 +; GFX90A-NEXT: s_getpc_b64 s[0:1] +; GFX90A-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX90A-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX90A-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX90A-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: s_mov_b32 s32, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX90A-NEXT: v_and_b32_e32 v2, 0xff, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff8000, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s35 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s34, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_lshlrev_b64 v[2:3], 3, v[2:3] +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 1, v3, vcc +; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX90A-NEXT: global_load_dwordx2 v[8:9], v[6:7], off offset:-4096 +; GFX90A-NEXT: s_movk_i32 s0, 0xf000 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[6:7], off +; GFX90A-NEXT: global_load_dwordx2 v[12:13], v[2:3], off offset:2048 +; GFX90A-NEXT: s_waitcnt vmcnt(2) +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v8, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v9, v5, vcc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v12, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v13, v3, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v11, v3, vcc +; GFX90A-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX90A-NEXT: s_endpgm entry: %call = tail call i64 @_Z13get_global_idj(i32 0) %conv = and i64 %call, 255 @@ -353,21 +1456,187 @@ ; TODO: Support load4 as anchor instruction. define amdgpu_kernel void @p32Offset64(i8 addrspace(1)* %buffer) { -; GCN-LABEL: p32Offset64: -; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; GFX8-LABEL: p32Offset64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s38, -1 +; GFX8-NEXT: s_mov_b32 s39, 0xe80000 +; GFX8-NEXT: s_add_u32 s36, s36, s11 +; GFX8-NEXT: s_addc_u32 s37, s37, 0 +; GFX8-NEXT: s_getpc_b64 s[0:1] +; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: s_mov_b32 s32, 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff8000, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, s35 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, s34, v0 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[1:2] +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: s_mov_b32 s0, 0x7ffff800 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: s_mov_b32 s0, 0x7ffffc00 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: flat_load_dword v5, v[5:6] +; GFX8-NEXT: flat_load_dword v6, v[7:8] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x80000000, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(2) +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v5, v2 +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v6, v1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: flat_store_dword v[3:4], v0 +; GFX8-NEXT: s_endpgm +; +; GFX900-LABEL: p32Offset64: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX900-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX900-NEXT: s_mov_b32 s38, -1 +; GFX900-NEXT: s_mov_b32 s39, 0xe00000 +; GFX900-NEXT: s_add_u32 s36, s36, s11 +; GFX900-NEXT: s_addc_u32 s37, s37, 0 +; GFX900-NEXT: s_getpc_b64 s[0:1] +; GFX900-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX900-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX900-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX900-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX900-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX900-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: s_mov_b32 s32, 0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX900-NEXT: v_and_b32_e32 v1, 0xff, v0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff8000, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, s35 +; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, s34, v0 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], 2, v[1:2] +; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 +; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, 0x7ffff000, v0 +; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc +; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, 0x80000000, v0 +; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v1, vcc +; GFX900-NEXT: global_load_dword v2, v[0:1], off +; GFX900-NEXT: global_load_dword v9, v[5:6], off offset:2048 +; GFX900-NEXT: global_load_dword v10, v[5:6], off offset:3072 +; GFX900-NEXT: global_load_dword v11, v[7:8], off +; GFX900-NEXT: s_waitcnt vmcnt(2) +; GFX900-NEXT: v_add_u32_e32 v0, v9, v2 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_add3_u32 v0, v10, v0, v11 +; GFX900-NEXT: global_store_dword v[3:4], v0, off +; GFX900-NEXT: s_endpgm ; -; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048 -; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072 -; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}} +; GFX10-LABEL: p32Offset64: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX10-NEXT: s_mov_b32 s38, -1 +; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX10-NEXT: s_add_u32 s36, s36, s11 +; GFX10-NEXT: s_addc_u32 s37, s37, 0 +; GFX10-NEXT: s_getpc_b64 s[0:1] +; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX10-NEXT: s_mov_b32 s32, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 7, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff8000, v2 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX10-NEXT: v_add_co_u32 v2, s0, s34, v2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s35, 0, s0 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, 0x80000000 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: global_load_dword v6, v[0:1], off +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ffff800, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: global_load_dword v7, v[4:5], off offset:-2048 +; GFX10-NEXT: global_load_dword v8, v[4:5], off +; GFX10-NEXT: global_load_dword v9, v[0:1], off offset:1024 +; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: v_add_nc_u32_e32 v0, v7, v6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add3_u32 v0, v9, v0, v8 +; GFX10-NEXT: global_store_dword v[2:3], v0, off +; GFX10-NEXT: s_endpgm ; -; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048 -; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024 +; GFX90A-LABEL: p32Offset64: +; GFX90A: ; %bb.0: ; %entry +; GFX90A-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX90A-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX90A-NEXT: s_mov_b32 s38, -1 +; GFX90A-NEXT: s_mov_b32 s39, 0xe00000 +; GFX90A-NEXT: s_add_u32 s36, s36, s11 +; GFX90A-NEXT: s_addc_u32 s37, s37, 0 +; GFX90A-NEXT: s_getpc_b64 s[0:1] +; GFX90A-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX90A-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX90A-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX90A-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: s_mov_b32 s32, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX90A-NEXT: v_and_b32_e32 v2, 0xff, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff8000, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s35 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s34, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7ffff000, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v3, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x80000000, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v3, vcc +; GFX90A-NEXT: global_load_dword v8, v[2:3], off +; GFX90A-NEXT: global_load_dword v9, v[4:5], off offset:2048 +; GFX90A-NEXT: global_load_dword v10, v[4:5], off offset:3072 +; GFX90A-NEXT: global_load_dword v11, v[6:7], off +; GFX90A-NEXT: s_waitcnt vmcnt(2) +; GFX90A-NEXT: v_add_u32_e32 v2, v9, v8 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add3_u32 v2, v10, v2, v11 +; GFX90A-NEXT: global_store_dword v[0:1], v2, off +; GFX90A-NEXT: s_endpgm entry: %call = tail call i64 @_Z13get_global_idj(i32 0) %conv = and i64 %call, 255 @@ -398,27 +1667,185 @@ } define amdgpu_kernel void @DiffBase(i8 addrspace(1)* %buffer1, -; GCN-LABEL: DiffBase: -; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8-LABEL: DiffBase: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s42, -1 +; GFX8-NEXT: s_mov_b32 s43, 0xe80000 +; GFX8-NEXT: s_add_u32 s40, s40, s11 +; GFX8-NEXT: s_addc_u32 s41, s41, 0 +; GFX8-NEXT: s_getpc_b64 s[0:1] +; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 +; GFX8-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX8-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: s_mov_b32 s32, 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff8000, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s36, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s39 +; GFX8-NEXT: v_add_u32_e32 v12, vcc, s38, v2 +; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x1000, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x1800, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x2000, v0 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x2800, v12 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[4:5] +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[6:7] +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x3000, v12 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v13, vcc +; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[8:9] +; GFX8-NEXT: flat_load_dwordx2 v[10:11], v[10:11] +; GFX8-NEXT: v_add_u32_e32 v12, vcc, 0x3800, v12 +; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; GFX8-NEXT: flat_load_dwordx2 v[12:13], v[12:13] +; GFX8-NEXT: s_waitcnt vmcnt(4) +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc +; GFX8-NEXT: s_waitcnt vmcnt(3) +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v7, v3, vcc +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v10, v8 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v11, v9, vcc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v12, v4 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v13, v5, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_endpgm ; -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 +; GFX9-LABEL: DiffBase: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s42, -1 +; GFX9-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-NEXT: s_add_u32 s40, s40, s11 +; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff8000, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s37 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s36, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, s39 +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, s38, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0x1000, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX9-NEXT: s_movk_i32 s0, 0x2000 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[2:3], off +; GFX9-NEXT: global_load_dwordx2 v[8:9], v[2:3], off offset:2048 +; GFX9-NEXT: global_load_dwordx2 v[10:11], v[4:5], off +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v13, vcc +; GFX9-NEXT: s_movk_i32 s0, 0x3000 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v13, vcc +; GFX9-NEXT: global_load_dwordx2 v[12:13], v[2:3], off offset:2048 +; GFX9-NEXT: global_load_dwordx2 v[14:15], v[4:5], off +; GFX9-NEXT: global_load_dwordx2 v[16:17], v[4:5], off offset:2048 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v9, v7, vcc +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v11, v3, vcc +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v14, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v15, v13, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v16, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v17, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX9-NEXT: s_endpgm ; -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} +; GFX10-LABEL: DiffBase: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX10-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX10-NEXT: s_mov_b32 s42, -1 +; GFX10-NEXT: s_mov_b32 s43, 0x31c16000 +; GFX10-NEXT: s_add_u32 s40, s40, s11 +; GFX10-NEXT: s_addc_u32 s41, s41, 0 +; GFX10-NEXT: s_getpc_b64 s[0:1] +; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 +; GFX10-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX10-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX10-NEXT: s_mov_b32 s32, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff8000, v0 +; GFX10-NEXT: v_add_co_u32 v0, s0, s36, v2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s37, 0, s0 +; GFX10-NEXT: v_add_co_u32 v14, s0, s38, v2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v15, s0, s39, 0, s0 +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, 0x1800 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v14, 0x3000 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v15, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx2 v[6:7], v[2:3], off offset:-2048 +; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0x2000, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx2 v[10:11], v[4:5], off offset:-2048 +; GFX10-NEXT: global_load_dwordx2 v[12:13], v[4:5], off +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x3800, v14 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v15, vcc_lo +; GFX10-NEXT: global_load_dwordx2 v[14:15], v[2:3], off +; GFX10-NEXT: global_load_dwordx2 v[16:17], v[4:5], off +; GFX10-NEXT: s_waitcnt vmcnt(4) +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v8, v6 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v9, v7, vcc_lo +; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v12, v10 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v13, v11, vcc_lo +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v14, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v15, v3, vcc_lo +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v16, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v17, v5, vcc_lo +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo +; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX10-NEXT: s_endpgm i8 addrspace(1)* %buffer2) { entry: %call = tail call i64 @_Z13get_global_idj(i32 0) @@ -458,33 +1885,300 @@ } define amdgpu_kernel void @ReverseOrder(i8 addrspace(1)* %buffer) { -; GCN-LABEL: ReverseOrder: -; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8-LABEL: ReverseOrder: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s38, -1 +; GFX8-NEXT: s_mov_b32 s39, 0xe80000 +; GFX8-NEXT: s_add_u32 s36, s36, s11 +; GFX8-NEXT: s_addc_u32 s37, s37, 0 +; GFX8-NEXT: s_getpc_b64 s[0:1] +; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: s_mov_b32 s32, 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff8000, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, s35 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, s34, v0 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], 3, v[1:2] +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: s_movk_i32 s0, 0x3800 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: s_movk_i32 s0, 0x3000 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc +; GFX8-NEXT: s_movk_i32 s0, 0x2800 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[11:12], v[0:1] +; GFX8-NEXT: flat_load_dwordx2 v[5:6], v[5:6] +; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[7:8] +; GFX8-NEXT: flat_load_dwordx2 v[9:10], v[9:10] +; GFX8-NEXT: s_movk_i32 s0, 0x2000 +; GFX8-NEXT: v_add_u32_e32 v13, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v14, vcc, 0, v1, vcc +; GFX8-NEXT: s_movk_i32 s0, 0x1800 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[13:14], v[13:14] +; GFX8-NEXT: flat_load_dwordx2 v[15:16], v[15:16] +; GFX8-NEXT: s_movk_i32 s0, 0x1000 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[17:18], v[17:18] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x800, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(6) +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v11 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v6, v12, vcc +; GFX8-NEXT: s_waitcnt vmcnt(5) +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v7, v2 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v8, v5, vcc +; GFX8-NEXT: s_waitcnt vmcnt(4) +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v9, v2 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v10, v5, vcc +; GFX8-NEXT: s_waitcnt vmcnt(3) +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v13, v2 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v14, v5, vcc +; GFX8-NEXT: s_waitcnt vmcnt(2) +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v15, v2 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v16, v5, vcc +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v17, v2 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v18, v5, vcc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc +; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[0:1] +; GFX8-NEXT: s_endpgm ; -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 +; GFX900-LABEL: ReverseOrder: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX900-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX900-NEXT: s_mov_b32 s38, -1 +; GFX900-NEXT: s_mov_b32 s39, 0xe00000 +; GFX900-NEXT: s_add_u32 s36, s36, s11 +; GFX900-NEXT: s_addc_u32 s37, s37, 0 +; GFX900-NEXT: s_getpc_b64 s[0:1] +; GFX900-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX900-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX900-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX900-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX900-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX900-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: s_mov_b32 s32, 0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX900-NEXT: v_and_b32_e32 v1, 0xff, v0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff8000, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, s35 +; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, s34, v0 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], 3, v[1:2] +; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 +; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX900-NEXT: s_movk_i32 s0, 0x3000 +; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, s0, v0 +; GFX900-NEXT: global_load_dwordx2 v[5:6], v[0:1], off +; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v1, vcc +; GFX900-NEXT: global_load_dwordx2 v[9:10], v[7:8], off offset:2048 +; GFX900-NEXT: global_load_dwordx2 v[11:12], v[7:8], off +; GFX900-NEXT: s_movk_i32 s0, 0x2000 +; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, s0, v0 +; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v1, vcc +; GFX900-NEXT: global_load_dwordx2 v[13:14], v[7:8], off offset:2048 +; GFX900-NEXT: s_movk_i32 s0, 0x1000 +; GFX900-NEXT: v_add_co_u32_e32 v15, vcc, s0, v0 +; GFX900-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v1, vcc +; GFX900-NEXT: global_load_dwordx2 v[17:18], v[15:16], off +; GFX900-NEXT: global_load_dwordx2 v[19:20], v[7:8], off +; GFX900-NEXT: global_load_dwordx2 v[21:22], v[15:16], off offset:2048 +; GFX900-NEXT: global_load_dwordx2 v[23:24], v[0:1], off offset:2048 +; GFX900-NEXT: s_waitcnt vmcnt(6) +; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v9, v5 +; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v10, v6, vcc +; GFX900-NEXT: s_waitcnt vmcnt(5) +; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v11, v0 +; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v12, v1, vcc +; GFX900-NEXT: s_waitcnt vmcnt(4) +; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v13, v0 +; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v14, v1, vcc +; GFX900-NEXT: s_waitcnt vmcnt(2) +; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v19, v0 +; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v20, v1, vcc +; GFX900-NEXT: s_waitcnt vmcnt(1) +; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v21, v0 +; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v22, v1, vcc +; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v17, v0 +; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v18, v1, vcc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v23, v0 +; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v24, v1, vcc +; GFX900-NEXT: global_store_dwordx2 v[3:4], v[0:1], off +; GFX900-NEXT: s_endpgm ; -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} +; GFX10-LABEL: ReverseOrder: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX10-NEXT: s_mov_b32 s38, -1 +; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX10-NEXT: s_add_u32 s36, s36, s11 +; GFX10-NEXT: s_addc_u32 s37, s37, 0 +; GFX10-NEXT: s_getpc_b64 s[0:1] +; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX10-NEXT: s_mov_b32 s32, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 7, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff8000, v2 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX10-NEXT: v_add_co_u32 v2, s0, s34, v2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s35, 0, s0 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x3800, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, 0x3000, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx2 v[8:9], v[0:1], off +; GFX10-NEXT: global_load_dwordx2 v[10:11], v[4:5], off +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x2800, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v12, vcc_lo, 0x2000, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v14, vcc_lo, 0x1800, v0 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx2 v[6:7], v[6:7], off +; GFX10-NEXT: global_load_dwordx2 v[12:13], v[12:13], off +; GFX10-NEXT: v_add_co_ci_u32_e32 v15, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v16, vcc_lo, 0x1000, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx2 v[14:15], v[14:15], off +; GFX10-NEXT: global_load_dwordx2 v[4:5], v[4:5], off +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx2 v[18:19], v[16:17], off +; GFX10-NEXT: global_load_dwordx2 v[20:21], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(6) +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v10, v8 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v11, v9, vcc_lo +; GFX10-NEXT: s_waitcnt vmcnt(5) +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v6, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v7, v1, vcc_lo +; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v4, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v5, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v12, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v13, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v14, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v15, v1, vcc_lo +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v18, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v19, v1, vcc_lo +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v20, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v21, v1, vcc_lo +; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: s_endpgm +; +; GFX90A-LABEL: ReverseOrder: +; GFX90A: ; %bb.0: ; %entry +; GFX90A-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX90A-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX90A-NEXT: s_mov_b32 s38, -1 +; GFX90A-NEXT: s_mov_b32 s39, 0xe00000 +; GFX90A-NEXT: s_add_u32 s36, s36, s11 +; GFX90A-NEXT: s_addc_u32 s37, s37, 0 +; GFX90A-NEXT: s_getpc_b64 s[0:1] +; GFX90A-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX90A-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX90A-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX90A-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: s_mov_b32 s32, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX90A-NEXT: v_and_b32_e32 v2, 0xff, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff8000, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s35 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s34, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_lshlrev_b64 v[2:3], 3, v[2:3] +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc +; GFX90A-NEXT: s_movk_i32 s0, 0x3000 +; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, s0, v2 +; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v3, vcc +; GFX90A-NEXT: global_load_dwordx2 v[8:9], v[6:7], off offset:2048 +; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[6:7], off +; GFX90A-NEXT: s_movk_i32 s0, 0x2000 +; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, s0, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v3, vcc +; GFX90A-NEXT: global_load_dwordx2 v[12:13], v[6:7], off offset:2048 +; GFX90A-NEXT: s_movk_i32 s0, 0x1000 +; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, s0, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v3, vcc +; GFX90A-NEXT: global_load_dwordx2 v[16:17], v[14:15], off +; GFX90A-NEXT: global_load_dwordx2 v[18:19], v[6:7], off +; GFX90A-NEXT: global_load_dwordx2 v[20:21], v[14:15], off offset:2048 +; GFX90A-NEXT: global_load_dwordx2 v[22:23], v[2:3], off offset:2048 +; GFX90A-NEXT: s_waitcnt vmcnt(6) +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v8, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v9, v5, vcc +; GFX90A-NEXT: s_waitcnt vmcnt(5) +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v11, v3, vcc +; GFX90A-NEXT: s_waitcnt vmcnt(4) +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v12, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v13, v3, vcc +; GFX90A-NEXT: s_waitcnt vmcnt(2) +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v18, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v19, v3, vcc +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v20, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v21, v3, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v16, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v17, v3, vcc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v22, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v23, v3, vcc +; GFX90A-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX90A-NEXT: s_endpgm entry: %call = tail call i64 @_Z13get_global_idj(i32 0) %conv = and i64 %call, 255 @@ -529,15 +2223,172 @@ } define hidden amdgpu_kernel void @negativeoffset(i8 addrspace(1)* nocapture %buffer) { -; GCN-LABEL: negativeoffset: -; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8-LABEL: negativeoffset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s38, -1 +; GFX8-NEXT: s_mov_b32 s39, 0xe80000 +; GFX8-NEXT: s_add_u32 s36, s36, s11 +; GFX8-NEXT: s_addc_u32 s37, s37, 0 +; GFX8-NEXT: s_getpc_b64 s[0:1] +; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: s_mov_b32 s32, 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff8000, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, s35 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, s34, v0 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], 3, v[1:2] +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v0 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v4, v1, vcc +; GFX8-NEXT: s_movk_i32 s0, 0x800 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0, v2 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, -1, v6, vcc +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: flat_load_dwordx2 v[5:6], v[5:6] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v6, v1, vcc +; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX900-LABEL: negativeoffset: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX900-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX900-NEXT: s_mov_b32 s38, -1 +; GFX900-NEXT: s_mov_b32 s39, 0xe00000 +; GFX900-NEXT: s_add_u32 s36, s36, s11 +; GFX900-NEXT: s_addc_u32 s37, s37, 0 +; GFX900-NEXT: s_getpc_b64 s[0:1] +; GFX900-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX900-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX900-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX900-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX900-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX900-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: s_mov_b32 s32, 0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX900-NEXT: v_and_b32_e32 v1, 0xff, v0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff8000, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, s35 +; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, s34, v0 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], 3, v[1:2] +; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, v3, v0 +; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, v4, v1, vcc +; GFX900-NEXT: s_movk_i32 s0, 0x1000 +; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 +; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v6, vcc +; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, 0, v2 +; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v6, vcc +; GFX900-NEXT: global_load_dwordx2 v[7:8], v[0:1], off offset:-2048 +; GFX900-NEXT: global_load_dwordx2 v[9:10], v[5:6], off +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v9, v7 +; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v10, v8, vcc +; GFX900-NEXT: global_store_dwordx2 v[3:4], v[0:1], off +; GFX900-NEXT: s_endpgm ; -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} +; GFX10-LABEL: negativeoffset: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX10-NEXT: s_mov_b32 s38, -1 +; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX10-NEXT: s_add_u32 s36, s36, s11 +; GFX10-NEXT: s_addc_u32 s37, s37, 0 +; GFX10-NEXT: s_getpc_b64 s[0:1] +; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX10-NEXT: s_mov_b32 s32, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 7, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff8000, v2 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX10-NEXT: v_add_co_u32 v2, s0, s34, v2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s35, 0, s0 +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v2, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v3, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v5, vcc_lo +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v5, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX10-NEXT: global_load_dwordx2 v[8:9], v[4:5], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v8, v6 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v9, v7, vcc_lo +; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: s_endpgm ; -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} +; GFX90A-LABEL: negativeoffset: +; GFX90A: ; %bb.0: ; %entry +; GFX90A-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX90A-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX90A-NEXT: s_mov_b32 s38, -1 +; GFX90A-NEXT: s_mov_b32 s39, 0xe00000 +; GFX90A-NEXT: s_add_u32 s36, s36, s11 +; GFX90A-NEXT: s_addc_u32 s37, s37, 0 +; GFX90A-NEXT: s_getpc_b64 s[0:1] +; GFX90A-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX90A-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX90A-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX90A-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: s_mov_b32 s32, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX90A-NEXT: v_and_b32_e32 v2, 0xff, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff8000, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s35 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s34, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_lshlrev_b64 v[2:3], 3, v[2:3] +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v0, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v3, vcc +; GFX90A-NEXT: s_movk_i32 s0, 0x1000 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v5, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v5, vcc +; GFX90A-NEXT: global_load_dwordx2 v[6:7], v[2:3], off offset:-2048 +; GFX90A-NEXT: global_load_dwordx2 v[8:9], v[4:5], off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v8, v6 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v9, v7, vcc +; GFX90A-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX90A-NEXT: s_endpgm entry: %call = tail call i64 @_Z13get_global_idj(i32 0) #2 %conv = and i64 %call, 255 diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll --- a/llvm/test/CodeGen/AMDGPU/saddo.ll +++ b/llvm/test/CodeGen/AMDGPU/saddo.ll @@ -152,8 +152,8 @@ ; ; GFX9-LABEL: s_saddo_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll --- a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll +++ b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll @@ -57,9 +57,8 @@ ; SI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x2ee0 ; GCN: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}} ; GCN: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}} -; SI: s_mov_b32 -; SI: s_nop 1 -; SI: s_load_dword [[OUT:s[0-9]+]], s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, [[OFFSET]] +; SI-DAG: s_mov_b32 +; SI-DAG: s_load_dword [[OUT:s[0-9]+]], s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, [[OFFSET]] ; CI: s_load_dword [[OUT:s[0-9]+]], s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0xbb8 ; GCN: v_mov_b32_e32 [[V_OUT:v[0-9]+]], [[OUT]] @@ -171,10 +170,10 @@ ; GCN-LABEL: {{^}}smrd_valu_ci_offset_x8: ; GCN-NOHSA-DAG: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x9a40{{$}} -; CI-NOHSA-DAG: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x9a50{{$}} ; CI-NOHSA-NOT: v_add ; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16 -; CI-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}} +; CI-NOHSA-DAG: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x9a50{{$}} +; CI-NOHSA-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}} ; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}} ; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} @@ -203,9 +202,9 @@ ; GCN-LABEL: {{^}}smrd_valu_ci_offset_x16: ; SI: s_mov_b32 {{s[0-9]+}}, 0x13480 -; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:32 -; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:48 -; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16 +; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16 +; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:32 +; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:48 ; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], {{s[0-9]+}} addr64 ; CI-NOHSA-DAG: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x13480{{$}} ; CI-NOHSA-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll --- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll @@ -26,22 +26,22 @@ ; ; VI-LABEL: scalar_to_vector_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm %tmp1 = load i32, i32 addrspace(1)* %in, align 4 %bc = bitcast i32 %tmp1 to <2 x i16> @@ -73,22 +73,22 @@ ; ; VI-LABEL: scalar_to_vector_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm %tmp1 = load float, float addrspace(1)* %in, align 4 %bc = bitcast float %tmp1 to <2 x i16> @@ -234,13 +234,13 @@ ; ; VI-LABEL: scalar_to_vector_test6: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %newvec0 = insertelement <4 x i8> undef, i8 %val, i32 0 %bc = bitcast <4 x i8> %newvec0 to <2 x half> diff --git a/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir b/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir --- a/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir @@ -19,51 +19,55 @@ body: | ; CHECK-LABEL: name: only_dbg_value_sched_region ; CHECK: bb.0: - ; CHECK: successors: %bb.1(0x80000000) - ; CHECK: liveins: $vgpr0 - ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; CHECK: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[DEF]], 0, 0, implicit $exec - ; CHECK: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 8, 0, implicit $exec - ; CHECK: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[GLOBAL_LOAD_DWORDX2_]] - ; CHECK: undef %6.sub0:vreg_64 = V_ADD_F32_e32 [[DEF]].sub0, [[COPY1]].sub0, implicit $mode, implicit $exec - ; CHECK: dead undef %6.sub1:vreg_64 = V_ADD_F32_e32 [[DEF]].sub1, [[COPY1]].sub0, implicit $mode, implicit $exec - ; CHECK: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY1]], 0, 0, implicit $exec - ; CHECK: undef %4.sub0:vreg_64 = V_MOV_B32_e32 111, implicit $exec - ; CHECK: [[DEF1:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; CHECK: [[DEF2:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; CHECK: [[DEF3:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; CHECK: undef %11.sub1:vreg_64 = IMPLICIT_DEF - ; CHECK: [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK: [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK: [[DEF6:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; CHECK: undef %19.sub0:vreg_64 = V_ADD_F32_e32 [[GLOBAL_LOAD_DWORD1]], [[GLOBAL_LOAD_DWORDX2_]].sub0, implicit $mode, implicit $exec - ; CHECK: [[DEF7:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; CHECK: %19.sub1:vreg_64 = V_ADD_F32_e32 [[GLOBAL_LOAD_DWORD]], [[GLOBAL_LOAD_DWORD]], implicit $mode, implicit $exec - ; CHECK: [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK: %4.sub1:vreg_64 = V_ADD_U32_e32 [[COPY]], [[COPY]], implicit $exec - ; CHECK: GLOBAL_STORE_DWORDX2 %19, %4, 32, 0, implicit $exec - ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK: %11.sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF2]], 0, 0, implicit $exec - ; CHECK: [[DEF1]].sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF3]], 0, 0, implicit $exec - ; CHECK: dead %20:vgpr_32 = GLOBAL_LOAD_DWORD %11, 0, 0, implicit $exec - ; CHECK: dead %21:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF6]], 0, 0, implicit $exec - ; CHECK: [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64 = V_LSHLREV_B64_e64 2, [[DEF1]], implicit $exec - ; CHECK: dead %22:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF7]], 0, 0, implicit $exec - ; CHECK: S_NOP 0, implicit [[DEF5]], implicit [[V_LSHLREV_B64_e64_]].sub0, implicit [[DEF4]], implicit [[V_MOV_B32_e32_]] - ; CHECK: GLOBAL_STORE_DWORD [[DEF7]], [[V_MOV_B32_e32_1]], 0, 0, implicit $exec - ; CHECK: bb.1: - ; CHECK: successors: %bb.2(0x80000000) - ; CHECK: S_SETREG_IMM32_B32 0, 1, implicit-def $mode, implicit $mode - ; CHECK: DBG_VALUE - ; CHECK: DBG_VALUE - ; CHECK: DBG_VALUE - ; CHECK: S_SETREG_IMM32_B32 0, 1, implicit-def $mode, implicit $mode - ; CHECK: bb.2: - ; CHECK: S_NOP 0, implicit [[COPY]] - ; CHECK: S_NOP 0, implicit [[DEF8]] - ; CHECK: S_ENDPGM 0 + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[DEF]], 0, 0, implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 8, 0, implicit $exec + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[GLOBAL_LOAD_DWORDX2_]] + ; CHECK-NEXT: undef %6.sub0:vreg_64 = V_ADD_F32_e32 [[DEF]].sub0, [[COPY1]].sub0, implicit $mode, implicit $exec + ; CHECK-NEXT: dead undef %6.sub1:vreg_64 = V_ADD_F32_e32 [[DEF]].sub1, [[COPY1]].sub0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY1]], 0, 0, implicit $exec + ; CHECK-NEXT: undef %4.sub0:vreg_64 = V_MOV_B32_e32 111, implicit $exec + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: %4.sub1:vreg_64 = V_ADD_U32_e32 [[COPY]], [[COPY]], implicit $exec + ; CHECK-NEXT: undef %19.sub1:vreg_64 = V_ADD_F32_e32 [[GLOBAL_LOAD_DWORD]], [[GLOBAL_LOAD_DWORD]], implicit $mode, implicit $exec + ; CHECK-NEXT: %19.sub0:vreg_64 = V_ADD_F32_e32 [[GLOBAL_LOAD_DWORD1]], [[GLOBAL_LOAD_DWORDX2_]].sub0, implicit $mode, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 %19, %4, 32, 0, implicit $exec + ; CHECK-NEXT: undef %11.sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF1]], 0, 0, implicit $exec + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2]].sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF3]], 0, 0, implicit $exec + ; CHECK-NEXT: %11.sub1:vreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF4:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: dead %20:vgpr_32 = GLOBAL_LOAD_DWORD %11, 0, 0, implicit $exec + ; CHECK-NEXT: dead %21:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF4]], 0, 0, implicit $exec + ; CHECK-NEXT: dead %22:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF5]], 0, 0, implicit $exec + ; CHECK-NEXT: [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64 = V_LSHLREV_B64_e64 2, [[DEF2]], implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: S_NOP 0, implicit [[DEF7]], implicit [[V_LSHLREV_B64_e64_]].sub0, implicit [[DEF6]], implicit [[V_MOV_B32_e32_]] + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[DEF5]], [[V_MOV_B32_e32_1]], 0, 0, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_SETREG_IMM32_B32 0, 1, implicit-def $mode, implicit $mode + ; CHECK-NEXT: DBG_VALUE + ; CHECK-NEXT: DBG_VALUE + ; CHECK-NEXT: DBG_VALUE + ; CHECK-NEXT: S_SETREG_IMM32_B32 0, 1, implicit-def $mode, implicit $mode + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: S_NOP 0, implicit [[COPY]] + ; CHECK-NEXT: S_NOP 0, implicit [[DEF8]] + ; CHECK-NEXT: S_ENDPGM 0 bb.0: liveins: $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/schedule-ilp.mir b/llvm/test/CodeGen/AMDGPU/schedule-ilp.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/schedule-ilp.mir @@ -0,0 +1,63 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -verify-machineinstrs -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck %s + +--- | + define amdgpu_kernel void @schedule_ilp(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void } + + !0 = distinct !{!0} + !1 = !{!1, !0} +... + +# Check that the load that defines %4 and its user are scheduled as far apart +# as possible. In this example, the generic machine scheduler's default latency +# heuristic will not be triggered early enough. Verify that our backend handles +# this properly. + +--- +name: schedule_ilp +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: $sgpr32 +stack: + - { id: 0, type: default, offset: 0, size: 4, alignment: 4 } + - { id: 1, type: default, offset: 0, size: 4, alignment: 4 } +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + + ; CHECK-LABEL: name: schedule_ilp + ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: $exec = S_OR_B64 $exec, [[DEF]], implicit-def $scc + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF3]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF3]], 4, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD2:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF3]], 8, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN [[DEF1]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN [[DEF2]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec + ; CHECK-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 -1, [[GLOBAL_LOAD_DWORD]], implicit $exec + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec + ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], [[V_CMP_NE_U32_e64_]], implicit-def dead $scc + ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_AND_B64_]] + ; CHECK-NEXT: S_ENDPGM 0, implicit [[GLOBAL_LOAD_DWORD1]], implicit [[GLOBAL_LOAD_DWORD2]], implicit [[V_MOV_B]] + %0:sreg_64 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:vgpr_32 = IMPLICIT_DEF + %3:vreg_64 = IMPLICIT_DEF + $exec = S_OR_B64 $exec, %0, implicit-def $scc + BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0) + BUFFER_STORE_DWORD_OFFEN %2:vgpr_32, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0) + %4:vgpr_32 = GLOBAL_LOAD_DWORD %3, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0) + %5:vgpr_32 = GLOBAL_LOAD_DWORD %3, 4, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0) + %6:vgpr_32 = GLOBAL_LOAD_DWORD %3, 8, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0) + %7:sreg_64 = V_CMP_NE_U32_e64 -1, %4, implicit $exec + %8:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec + %9:sreg_64 = COPY $exec, implicit-def $exec + %10:sreg_64 = S_AND_B64 %9:sreg_64, %7:sreg_64, implicit-def dead $scc + $exec = S_MOV_B64_term %10 + S_ENDPGM 0, implicit %5, implicit %6, implicit %8 +... diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll --- a/llvm/test/CodeGen/AMDGPU/sdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll @@ -106,17 +106,17 @@ ; ; GFX9-LABEL: sdiv_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 ; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 @@ -146,7 +146,7 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: sdiv_i32: @@ -200,65 +200,65 @@ define amdgpu_kernel void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; GCN-LABEL: sdiv_i32_4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s10, s2 -; GCN-NEXT: s_mov_b32 s11, s3 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s10, s6 +; GCN-NEXT: s_mov_b32 s11, s7 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s6 -; GCN-NEXT: s_mov_b32 s9, s7 +; GCN-NEXT: s_mov_b32 s8, s2 +; GCN-NEXT: s_mov_b32 s9, s3 ; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 30, v1 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_ashrrev_i32_e32 v0, 2, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; TONGA-LABEL: sdiv_i32_4: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; TONGA-NEXT: s_mov_b32 s3, 0xf000 -; TONGA-NEXT: s_mov_b32 s2, -1 -; TONGA-NEXT: s_mov_b32 s10, s2 -; TONGA-NEXT: s_mov_b32 s11, s3 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; TONGA-NEXT: s_mov_b32 s7, 0xf000 +; TONGA-NEXT: s_mov_b32 s6, -1 +; TONGA-NEXT: s_mov_b32 s10, s6 +; TONGA-NEXT: s_mov_b32 s11, s7 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) -; TONGA-NEXT: s_mov_b32 s8, s6 -; TONGA-NEXT: s_mov_b32 s9, s7 +; TONGA-NEXT: s_mov_b32 s8, s2 +; TONGA-NEXT: s_mov_b32 s9, s3 ; TONGA-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; TONGA-NEXT: s_mov_b32 s0, s4 -; TONGA-NEXT: s_mov_b32 s1, s5 +; TONGA-NEXT: s_mov_b32 s4, s0 +; TONGA-NEXT: s_mov_b32 s5, s1 ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; TONGA-NEXT: v_lshrrev_b32_e32 v1, 30, v1 ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 2, v0 -; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; TONGA-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_i32_4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: s_mov_b32 s6, s2 -; GFX9-NEXT: s_mov_b32 s7, s3 -; GFX9-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 30, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 2, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: sdiv_i32_4: @@ -294,71 +294,71 @@ define amdgpu_kernel void @slow_sdiv_i32_3435(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; GCN-LABEL: slow_sdiv_i32_3435: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s10, s2 -; GCN-NEXT: s_mov_b32 s11, s3 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s10, s6 +; GCN-NEXT: s_mov_b32 s11, s7 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s6 -; GCN-NEXT: s_mov_b32 s9, s7 +; GCN-NEXT: s_mov_b32 s8, s2 +; GCN-NEXT: s_mov_b32 s9, s3 ; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GCN-NEXT: s_mov_b32 s0, 0x98a1930b -; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_mov_b32 s2, 0x98a1930b +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_hi_i32 v1, v0, s0 -; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: v_mul_hi_i32 v1, v0, s2 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 31, v0 ; GCN-NEXT: v_ashrrev_i32_e32 v0, 11, v0 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; TONGA-LABEL: slow_sdiv_i32_3435: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; TONGA-NEXT: s_mov_b32 s3, 0xf000 -; TONGA-NEXT: s_mov_b32 s2, -1 -; TONGA-NEXT: s_mov_b32 s10, s2 -; TONGA-NEXT: s_mov_b32 s11, s3 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; TONGA-NEXT: s_mov_b32 s7, 0xf000 +; TONGA-NEXT: s_mov_b32 s6, -1 +; TONGA-NEXT: s_mov_b32 s10, s6 +; TONGA-NEXT: s_mov_b32 s11, s7 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) -; TONGA-NEXT: s_mov_b32 s8, s6 -; TONGA-NEXT: s_mov_b32 s9, s7 +; TONGA-NEXT: s_mov_b32 s8, s2 +; TONGA-NEXT: s_mov_b32 s9, s3 ; TONGA-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; TONGA-NEXT: s_mov_b32 s0, 0x98a1930b -; TONGA-NEXT: s_mov_b32 s1, s5 +; TONGA-NEXT: s_mov_b32 s2, 0x98a1930b +; TONGA-NEXT: s_mov_b32 s4, s0 +; TONGA-NEXT: s_mov_b32 s5, s1 ; TONGA-NEXT: s_waitcnt vmcnt(0) -; TONGA-NEXT: v_mul_hi_i32 v1, v0, s0 -; TONGA-NEXT: s_mov_b32 s0, s4 +; TONGA-NEXT: v_mul_hi_i32 v1, v0, s2 ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; TONGA-NEXT: v_lshrrev_b32_e32 v1, 31, v0 ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 11, v0 ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v1, v0 -; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; TONGA-NEXT: s_endpgm ; ; GFX9-LABEL: slow_sdiv_i32_3435: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 ; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s0, 0x98a1930b -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s2, 0x98a1930b +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mul_hi_i32 v1, v0, s0 -; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: v_mul_hi_i32 v1, v0, s2 ; GFX9-NEXT: v_add_u32_e32 v0, v1, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 31, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 11, v0 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: slow_sdiv_i32_3435: @@ -689,17 +689,17 @@ define amdgpu_kernel void @sdiv_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { ; GCN-LABEL: sdiv_v2i32_4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s10, s2 -; GCN-NEXT: s_mov_b32 s11, s3 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s10, s6 +; GCN-NEXT: s_mov_b32 s11, s7 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s6 -; GCN-NEXT: s_mov_b32 s9, s7 +; GCN-NEXT: s_mov_b32 s8, s2 +; GCN-NEXT: s_mov_b32 s9, s3 ; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v0 ; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v1 @@ -709,22 +709,22 @@ ; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GCN-NEXT: v_ashrrev_i32_e32 v0, 2, v0 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 2, v1 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; TONGA-LABEL: sdiv_v2i32_4: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; TONGA-NEXT: s_mov_b32 s3, 0xf000 -; TONGA-NEXT: s_mov_b32 s2, -1 -; TONGA-NEXT: s_mov_b32 s10, s2 -; TONGA-NEXT: s_mov_b32 s11, s3 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; TONGA-NEXT: s_mov_b32 s7, 0xf000 +; TONGA-NEXT: s_mov_b32 s6, -1 +; TONGA-NEXT: s_mov_b32 s10, s6 +; TONGA-NEXT: s_mov_b32 s11, s7 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) -; TONGA-NEXT: s_mov_b32 s8, s6 -; TONGA-NEXT: s_mov_b32 s9, s7 +; TONGA-NEXT: s_mov_b32 s8, s2 +; TONGA-NEXT: s_mov_b32 s9, s3 ; TONGA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; TONGA-NEXT: s_mov_b32 s0, s4 -; TONGA-NEXT: s_mov_b32 s1, s5 +; TONGA-NEXT: s_mov_b32 s4, s0 +; TONGA-NEXT: s_mov_b32 s5, s1 ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_ashrrev_i32_e32 v2, 31, v0 ; TONGA-NEXT: v_ashrrev_i32_e32 v3, 31, v1 @@ -734,22 +734,22 @@ ; TONGA-NEXT: v_add_u32_e32 v1, vcc, v3, v1 ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 2, v0 ; TONGA-NEXT: v_ashrrev_i32_e32 v1, 2, v1 -; TONGA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; TONGA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; TONGA-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v2i32_4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 ; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v1 @@ -759,7 +759,7 @@ ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 2, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 2, v1 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: sdiv_v2i32_4: @@ -1319,17 +1319,17 @@ define amdgpu_kernel void @sdiv_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { ; GCN-LABEL: sdiv_v4i32_4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s10, s2 -; GCN-NEXT: s_mov_b32 s11, s3 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s10, s6 +; GCN-NEXT: s_mov_b32 s11, s7 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s6 -; GCN-NEXT: s_mov_b32 s9, s7 +; GCN-NEXT: s_mov_b32 s8, s2 +; GCN-NEXT: s_mov_b32 s9, s3 ; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_ashrrev_i32_e32 v4, 31, v0 ; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v1 @@ -1347,22 +1347,22 @@ ; GCN-NEXT: v_ashrrev_i32_e32 v1, 2, v1 ; GCN-NEXT: v_ashrrev_i32_e32 v2, 2, v2 ; GCN-NEXT: v_ashrrev_i32_e32 v3, 2, v3 -; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; TONGA-LABEL: sdiv_v4i32_4: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; TONGA-NEXT: s_mov_b32 s3, 0xf000 -; TONGA-NEXT: s_mov_b32 s2, -1 -; TONGA-NEXT: s_mov_b32 s10, s2 -; TONGA-NEXT: s_mov_b32 s11, s3 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; TONGA-NEXT: s_mov_b32 s7, 0xf000 +; TONGA-NEXT: s_mov_b32 s6, -1 +; TONGA-NEXT: s_mov_b32 s10, s6 +; TONGA-NEXT: s_mov_b32 s11, s7 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) -; TONGA-NEXT: s_mov_b32 s8, s6 -; TONGA-NEXT: s_mov_b32 s9, s7 +; TONGA-NEXT: s_mov_b32 s8, s2 +; TONGA-NEXT: s_mov_b32 s9, s3 ; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; TONGA-NEXT: s_mov_b32 s0, s4 -; TONGA-NEXT: s_mov_b32 s1, s5 +; TONGA-NEXT: s_mov_b32 s4, s0 +; TONGA-NEXT: s_mov_b32 s5, s1 ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_ashrrev_i32_e32 v4, 31, v0 ; TONGA-NEXT: v_ashrrev_i32_e32 v5, 31, v1 @@ -1380,22 +1380,22 @@ ; TONGA-NEXT: v_ashrrev_i32_e32 v1, 2, v1 ; TONGA-NEXT: v_ashrrev_i32_e32 v2, 2, v2 ; TONGA-NEXT: v_ashrrev_i32_e32 v3, 2, v3 -; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; TONGA-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v4i32_4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 ; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v1 @@ -1413,7 +1413,7 @@ ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 2, v1 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 2, v2 ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 2, v3 -; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: sdiv_v4i32_4: @@ -1463,26 +1463,26 @@ define amdgpu_kernel void @v_sdiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { ; GCN-LABEL: v_sdiv_i8: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s10, s2 -; GCN-NEXT: s_mov_b32 s11, s3 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s10, s6 +; GCN-NEXT: s_mov_b32 s11, s7 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s6 -; GCN-NEXT: s_mov_b32 s9, s7 -; GCN-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 -; GCN-NEXT: buffer_load_sbyte v1, off, s[8:11], 0 offset:1 -; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_mov_b32 s8, s2 +; GCN-NEXT: s_mov_b32 s9, s3 +; GCN-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 offset:1 +; GCN-NEXT: buffer_load_sbyte v1, off, s[8:11], 0 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cvt_f32_i32_e32 v3, v0 +; GCN-NEXT: v_cvt_f32_i32_e32 v2, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_i32_e32 v2, v1 -; GCN-NEXT: v_xor_b32_e32 v0, v0, v1 +; GCN-NEXT: v_cvt_f32_i32_e32 v3, v1 +; GCN-NEXT: v_xor_b32_e32 v0, v1, v0 ; GCN-NEXT: v_ashrrev_i32_e32 v0, 30, v0 -; GCN-NEXT: v_or_b32_e32 v0, 1, v0 ; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GCN-NEXT: v_or_b32_e32 v0, 1, v0 ; GCN-NEXT: v_mul_f32_e32 v1, v3, v4 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_mad_f32 v3, -v1, v2, v3 @@ -1491,31 +1491,31 @@ ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; TONGA-LABEL: v_sdiv_i8: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; TONGA-NEXT: s_mov_b32 s3, 0xf000 -; TONGA-NEXT: s_mov_b32 s2, -1 -; TONGA-NEXT: s_mov_b32 s10, s2 -; TONGA-NEXT: s_mov_b32 s11, s3 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; TONGA-NEXT: s_mov_b32 s7, 0xf000 +; TONGA-NEXT: s_mov_b32 s6, -1 +; TONGA-NEXT: s_mov_b32 s10, s6 +; TONGA-NEXT: s_mov_b32 s11, s7 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) -; TONGA-NEXT: s_mov_b32 s8, s6 -; TONGA-NEXT: s_mov_b32 s9, s7 -; TONGA-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 -; TONGA-NEXT: buffer_load_sbyte v1, off, s[8:11], 0 offset:1 -; TONGA-NEXT: s_mov_b32 s0, s4 -; TONGA-NEXT: s_mov_b32 s1, s5 +; TONGA-NEXT: s_mov_b32 s8, s2 +; TONGA-NEXT: s_mov_b32 s9, s3 +; TONGA-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 offset:1 +; TONGA-NEXT: buffer_load_sbyte v1, off, s[8:11], 0 +; TONGA-NEXT: s_mov_b32 s4, s0 +; TONGA-NEXT: s_mov_b32 s5, s1 ; TONGA-NEXT: s_waitcnt vmcnt(1) -; TONGA-NEXT: v_cvt_f32_i32_e32 v3, v0 +; TONGA-NEXT: v_cvt_f32_i32_e32 v2, v0 ; TONGA-NEXT: s_waitcnt vmcnt(0) -; TONGA-NEXT: v_cvt_f32_i32_e32 v2, v1 -; TONGA-NEXT: v_xor_b32_e32 v0, v0, v1 +; TONGA-NEXT: v_cvt_f32_i32_e32 v3, v1 +; TONGA-NEXT: v_xor_b32_e32 v0, v1, v0 ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 30, v0 -; TONGA-NEXT: v_or_b32_e32 v0, 1, v0 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; TONGA-NEXT: v_or_b32_e32 v0, 1, v0 ; TONGA-NEXT: v_mul_f32_e32 v1, v3, v4 ; TONGA-NEXT: v_trunc_f32_e32 v1, v1 ; TONGA-NEXT: v_mad_f32 v3, -v1, v2, v3 @@ -1524,31 +1524,31 @@ ; TONGA-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 8 -; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; TONGA-NEXT: s_endpgm ; ; GFX9-LABEL: v_sdiv_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 -; GFX9-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 -; GFX9-NEXT: buffer_load_sbyte v1, off, s[8:11], 0 offset:1 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 offset:1 +; GFX9-NEXT: buffer_load_sbyte v1, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v0 +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v1 -; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v1 +; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 30, v0 -; GFX9-NEXT: v_or_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GFX9-NEXT: v_or_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, v3, v4 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v1 @@ -1557,7 +1557,7 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX9-NEXT: v_add_u32_e32 v0, v4, v0 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v_sdiv_i8: @@ -1608,125 +1608,131 @@ define amdgpu_kernel void @v_sdiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) { ; GCN-LABEL: v_sdiv_i23: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s10, s2 -; GCN-NEXT: s_mov_b32 s11, s3 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s10, s6 +; GCN-NEXT: s_mov_b32 s11, s7 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s6 -; GCN-NEXT: s_mov_b32 s9, s7 -; GCN-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GCN-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:2 +; GCN-NEXT: s_mov_b32 s8, s2 +; GCN-NEXT: s_mov_b32 s9, s3 +; GCN-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:2 +; GCN-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:6 ; GCN-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:4 -; GCN-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:6 -; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: buffer_load_ushort v3, off, s[8:11], 0 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_waitcnt vmcnt(2) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_or_b32_e32 v1, v2, v1 +; GCN-NEXT: v_bfe_i32 v1, v1, 0, 23 +; GCN-NEXT: v_cvt_f32_i32_e32 v2, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v2, v2, v3 -; GCN-NEXT: v_bfe_i32 v2, v2, 0, 23 -; GCN-NEXT: v_cvt_f32_i32_e32 v3, v2 +; GCN-NEXT: v_or_b32_e32 v0, v3, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 23 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, v0 -; GCN-NEXT: v_xor_b32_e32 v0, v0, v2 -; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v3 +; GCN-NEXT: v_cvt_f32_i32_e32 v3, v0 +; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GCN-NEXT: v_xor_b32_e32 v0, v0, v1 ; GCN-NEXT: v_ashrrev_i32_e32 v0, 30, v0 ; GCN-NEXT: v_or_b32_e32 v0, 1, v0 -; GCN-NEXT: v_mul_f32_e32 v2, v1, v4 -; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_mad_f32 v1, -v2, v3, v1 -; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v3| +; GCN-NEXT: v_mul_f32_e32 v1, v3, v4 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_mad_f32 v3, -v1, v2, v3 +; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; TONGA-LABEL: v_sdiv_i23: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; TONGA-NEXT: s_mov_b32 s3, 0xf000 -; TONGA-NEXT: s_mov_b32 s2, -1 -; TONGA-NEXT: s_mov_b32 s10, s2 -; TONGA-NEXT: s_mov_b32 s11, s3 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; TONGA-NEXT: s_mov_b32 s7, 0xf000 +; TONGA-NEXT: s_mov_b32 s6, -1 +; TONGA-NEXT: s_mov_b32 s10, s6 +; TONGA-NEXT: s_mov_b32 s11, s7 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) -; TONGA-NEXT: s_mov_b32 s8, s6 -; TONGA-NEXT: s_mov_b32 s9, s7 -; TONGA-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; TONGA-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:2 +; TONGA-NEXT: s_mov_b32 s8, s2 +; TONGA-NEXT: s_mov_b32 s9, s3 +; TONGA-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:2 +; TONGA-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:6 ; TONGA-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:4 -; TONGA-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:6 -; TONGA-NEXT: s_mov_b32 s0, s4 -; TONGA-NEXT: s_mov_b32 s1, s5 +; TONGA-NEXT: buffer_load_ushort v3, off, s[8:11], 0 +; TONGA-NEXT: s_mov_b32 s4, s0 +; TONGA-NEXT: s_mov_b32 s5, s1 +; TONGA-NEXT: s_waitcnt vmcnt(3) +; TONGA-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; TONGA-NEXT: s_waitcnt vmcnt(2) ; TONGA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; TONGA-NEXT: v_or_b32_e32 v0, v0, v1 +; TONGA-NEXT: s_waitcnt vmcnt(1) +; TONGA-NEXT: v_or_b32_e32 v1, v2, v1 +; TONGA-NEXT: v_bfe_i32 v1, v1, 0, 23 +; TONGA-NEXT: v_cvt_f32_i32_e32 v2, v1 ; TONGA-NEXT: s_waitcnt vmcnt(0) -; TONGA-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; TONGA-NEXT: v_or_b32_e32 v2, v2, v3 -; TONGA-NEXT: v_bfe_i32 v2, v2, 0, 23 -; TONGA-NEXT: v_cvt_f32_i32_e32 v3, v2 +; TONGA-NEXT: v_or_b32_e32 v0, v3, v0 ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 23 -; TONGA-NEXT: v_cvt_f32_i32_e32 v1, v0 -; TONGA-NEXT: v_xor_b32_e32 v0, v0, v2 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v4, v3 +; TONGA-NEXT: v_cvt_f32_i32_e32 v3, v0 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; TONGA-NEXT: v_xor_b32_e32 v0, v0, v1 ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 30, v0 ; TONGA-NEXT: v_or_b32_e32 v0, 1, v0 -; TONGA-NEXT: v_mul_f32_e32 v2, v1, v4 -; TONGA-NEXT: v_trunc_f32_e32 v2, v2 -; TONGA-NEXT: v_mad_f32 v1, -v2, v3, v1 -; TONGA-NEXT: v_cvt_i32_f32_e32 v2, v2 -; TONGA-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v3| +; TONGA-NEXT: v_mul_f32_e32 v1, v3, v4 +; TONGA-NEXT: v_trunc_f32_e32 v1, v1 +; TONGA-NEXT: v_mad_f32 v3, -v1, v2, v3 +; TONGA-NEXT: v_cvt_i32_f32_e32 v1, v1 +; TONGA-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| ; TONGA-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v2, v0 +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 23 -; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; TONGA-NEXT: s_endpgm ; ; GFX9-LABEL: v_sdiv_i23: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 -; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GFX9-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:2 +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:2 +; GFX9-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:6 ; GFX9-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:4 -; GFX9-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:6 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: buffer_load_ushort v3, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 23 +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX9-NEXT: v_bfe_i32 v2, v2, 0, 23 -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v2 +; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 23 -; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v0 -; GFX9-NEXT: v_xor_b32_e32 v0, v0, v2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v3 +; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v0 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 30, v0 ; GFX9-NEXT: v_or_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_mul_f32_e32 v2, v1, v4 -; GFX9-NEXT: v_trunc_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v2 -; GFX9-NEXT: v_mad_f32 v1, -v2, v3, v1 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v3| +; GFX9-NEXT: v_mul_f32_e32 v1, v3, v4 +; GFX9-NEXT: v_trunc_f32_e32 v1, v1 +; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v1 +; GFX9-NEXT: v_mad_f32 v1, -v1, v2, v3 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v2| ; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX9-NEXT: v_add_u32_e32 v0, v4, v0 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 23 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v_sdiv_i23: @@ -1791,116 +1797,125 @@ define amdgpu_kernel void @v_sdiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) { ; GCN-LABEL: v_sdiv_i24: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s10, s2 -; GCN-NEXT: s_mov_b32 s11, s3 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s10, s6 +; GCN-NEXT: s_mov_b32 s11, s7 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s6 -; GCN-NEXT: s_mov_b32 s9, s7 -; GCN-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GCN-NEXT: buffer_load_sbyte v1, off, s[8:11], 0 offset:2 -; GCN-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:4 -; GCN-NEXT: buffer_load_sbyte v3, off, s[8:11], 0 offset:6 -; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_mov_b32 s8, s2 +; GCN-NEXT: s_mov_b32 s9, s3 +; GCN-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 offset:6 +; GCN-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:4 +; GCN-NEXT: buffer_load_sbyte v2, off, s[8:11], 0 offset:2 +; GCN-NEXT: buffer_load_ushort v3, off, s[8:11], 0 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: v_or_b32_e32 v1, v1, v4 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GCN-NEXT: v_or_b32_e32 v2, v2, v4 -; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GCN-NEXT: v_or_b32_e32 v0, v0, v4 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GCN-NEXT: v_xor_b32_e32 v1, v1, v3 -; GCN-NEXT: v_ashrrev_i32_e32 v1, 30, v1 -; GCN-NEXT: v_or_b32_e32 v1, 1, v1 -; GCN-NEXT: v_mul_f32_e32 v3, v0, v4 -; GCN-NEXT: v_trunc_f32_e32 v3, v3 -; GCN-NEXT: v_mad_f32 v0, -v3, v2, v0 -; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v2| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; GCN-NEXT: v_or_b32_e32 v3, v3, v4 +; GCN-NEXT: v_cvt_f32_i32_e32 v3, v3 +; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v1 +; GCN-NEXT: v_xor_b32_e32 v0, v2, v0 +; GCN-NEXT: v_ashrrev_i32_e32 v0, 30, v0 +; GCN-NEXT: v_or_b32_e32 v0, 1, v0 +; GCN-NEXT: v_mul_f32_e32 v2, v3, v4 +; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_mad_f32 v3, -v2, v1, v3 +; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1| +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; TONGA-LABEL: v_sdiv_i24: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; TONGA-NEXT: s_mov_b32 s3, 0xf000 -; TONGA-NEXT: s_mov_b32 s2, -1 -; TONGA-NEXT: s_mov_b32 s10, s2 -; TONGA-NEXT: s_mov_b32 s11, s3 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; TONGA-NEXT: s_mov_b32 s7, 0xf000 +; TONGA-NEXT: s_mov_b32 s6, -1 +; TONGA-NEXT: s_mov_b32 s10, s6 +; TONGA-NEXT: s_mov_b32 s11, s7 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) -; TONGA-NEXT: s_mov_b32 s8, s6 -; TONGA-NEXT: s_mov_b32 s9, s7 -; TONGA-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; TONGA-NEXT: buffer_load_sbyte v1, off, s[8:11], 0 offset:2 -; TONGA-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:4 -; TONGA-NEXT: buffer_load_sbyte v3, off, s[8:11], 0 offset:6 -; TONGA-NEXT: s_mov_b32 s0, s4 -; TONGA-NEXT: s_mov_b32 s1, s5 +; TONGA-NEXT: s_mov_b32 s8, s2 +; TONGA-NEXT: s_mov_b32 s9, s3 +; TONGA-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 offset:6 +; TONGA-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:4 +; TONGA-NEXT: buffer_load_sbyte v2, off, s[8:11], 0 offset:2 +; TONGA-NEXT: buffer_load_ushort v3, off, s[8:11], 0 +; TONGA-NEXT: s_mov_b32 s4, s0 +; TONGA-NEXT: s_mov_b32 s5, s1 +; TONGA-NEXT: s_waitcnt vmcnt(3) +; TONGA-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; TONGA-NEXT: s_waitcnt vmcnt(2) +; TONGA-NEXT: v_or_b32_e32 v1, v1, v4 +; TONGA-NEXT: v_cvt_f32_i32_e32 v1, v1 +; TONGA-NEXT: s_waitcnt vmcnt(1) +; TONGA-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; TONGA-NEXT: s_waitcnt vmcnt(0) -; TONGA-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; TONGA-NEXT: v_or_b32_e32 v2, v2, v4 -; TONGA-NEXT: v_cvt_f32_i32_e32 v2, v2 -; TONGA-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; TONGA-NEXT: v_or_b32_e32 v0, v0, v4 -; TONGA-NEXT: v_cvt_f32_i32_e32 v0, v0 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; TONGA-NEXT: v_xor_b32_e32 v1, v1, v3 -; TONGA-NEXT: v_ashrrev_i32_e32 v1, 30, v1 -; TONGA-NEXT: v_or_b32_e32 v1, 1, v1 -; TONGA-NEXT: v_mul_f32_e32 v3, v0, v4 -; TONGA-NEXT: v_trunc_f32_e32 v3, v3 -; TONGA-NEXT: v_mad_f32 v0, -v3, v2, v0 -; TONGA-NEXT: v_cvt_i32_f32_e32 v3, v3 -; TONGA-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v2| -; TONGA-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; TONGA-NEXT: v_or_b32_e32 v3, v3, v4 +; TONGA-NEXT: v_cvt_f32_i32_e32 v3, v3 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v4, v1 +; TONGA-NEXT: v_xor_b32_e32 v0, v2, v0 +; TONGA-NEXT: v_ashrrev_i32_e32 v0, 30, v0 +; TONGA-NEXT: v_or_b32_e32 v0, 1, v0 +; TONGA-NEXT: v_mul_f32_e32 v2, v3, v4 +; TONGA-NEXT: v_trunc_f32_e32 v2, v2 +; TONGA-NEXT: v_mad_f32 v3, -v2, v1, v3 +; TONGA-NEXT: v_cvt_i32_f32_e32 v2, v2 +; TONGA-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1| +; TONGA-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v2, v0 ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 24 -; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; TONGA-NEXT: s_endpgm ; ; GFX9-LABEL: v_sdiv_i24: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: s_mov_b32 s6, s2 -; GFX9-NEXT: s_mov_b32 s7, s3 -; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0 -; GFX9-NEXT: buffer_load_sbyte v1, off, s[4:7], 0 offset:2 -; GFX9-NEXT: buffer_load_ushort v2, off, s[4:7], 0 offset:4 -; GFX9-NEXT: buffer_load_sbyte v3, off, s[4:7], 0 offset:6 +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 offset:6 +; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:4 +; GFX9-NEXT: buffer_load_sbyte v2, off, s[8:11], 0 offset:2 +; GFX9-NEXT: buffer_load_ushort v3, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX9-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GFX9-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GFX9-NEXT: v_xor_b32_e32 v1, v1, v3 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 30, v1 -; GFX9-NEXT: v_or_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_mul_f32_e32 v3, v0, v4 -; GFX9-NEXT: v_trunc_f32_e32 v3, v3 -; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v3 -; GFX9-NEXT: v_mad_f32 v0, -v3, v2, v0 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v2| -; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v3 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v1 +; GFX9-NEXT: v_xor_b32_e32 v0, v2, v0 +; GFX9-NEXT: v_ashrrev_i32_e32 v0, 30, v0 +; GFX9-NEXT: v_or_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_mul_f32_e32 v2, v3, v4 +; GFX9-NEXT: v_trunc_f32_e32 v2, v2 +; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v2 +; GFX9-NEXT: v_mad_f32 v2, -v2, v1, v3 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| +; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX9-NEXT: v_add_u32_e32 v0, v4, v0 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 24 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v_sdiv_i24: @@ -2067,17 +2082,17 @@ ; ; GFX9-LABEL: v_sdiv_i25: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 ; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfe_i32 v2, v1, 0, 25 ; GFX9-NEXT: v_bfe_i32 v1, v1, 24, 1 @@ -2110,7 +2125,7 @@ ; GFX9-NEXT: v_xor_b32_e32 v1, v1, v0 ; GFX9-NEXT: v_sub_u32_e32 v0, v1, v0 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 25 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v_sdiv_i25: @@ -2199,21 +2214,21 @@ define amdgpu_kernel void @scalarize_mulhs_4xi32(<4 x i32> addrspace(1)* nocapture readonly %in, <4 x i32> addrspace(1)* nocapture %out) { ; GCN-LABEL: scalarize_mulhs_4xi32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 -; GCN-NEXT: s_mov_b32 s4, 0x1389c755 -; GCN-NEXT: s_mov_b32 s0, s6 -; GCN-NEXT: s_mov_b32 s1, s7 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-NEXT: s_mov_b32 s0, 0x1389c755 +; GCN-NEXT: s_mov_b32 s4, s2 +; GCN-NEXT: s_mov_b32 s5, s3 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_hi_i32 v0, v0, s4 -; GCN-NEXT: v_mul_hi_i32 v1, v1, s4 -; GCN-NEXT: v_mul_hi_i32 v2, v2, s4 -; GCN-NEXT: v_mul_hi_i32 v3, v3, s4 +; GCN-NEXT: v_mul_hi_i32 v0, v0, s0 +; GCN-NEXT: v_mul_hi_i32 v1, v1, s0 +; GCN-NEXT: v_mul_hi_i32 v2, v2, s0 +; GCN-NEXT: v_mul_hi_i32 v3, v3, s0 ; GCN-NEXT: v_lshrrev_b32_e32 v4, 31, v0 ; GCN-NEXT: v_ashrrev_i32_e32 v0, 12, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v5, 31, v1 @@ -2226,26 +2241,26 @@ ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v5 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; TONGA-LABEL: scalarize_mulhs_4xi32: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; TONGA-NEXT: s_mov_b32 s3, 0xf000 -; TONGA-NEXT: s_mov_b32 s2, -1 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; TONGA-NEXT: s_mov_b32 s7, 0xf000 +; TONGA-NEXT: s_mov_b32 s6, -1 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) -; TONGA-NEXT: s_mov_b32 s0, s4 -; TONGA-NEXT: s_mov_b32 s1, s5 -; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 -; TONGA-NEXT: s_mov_b32 s4, 0x1389c755 -; TONGA-NEXT: s_mov_b32 s0, s6 -; TONGA-NEXT: s_mov_b32 s1, s7 +; TONGA-NEXT: s_mov_b32 s4, s0 +; TONGA-NEXT: s_mov_b32 s5, s1 +; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; TONGA-NEXT: s_mov_b32 s0, 0x1389c755 +; TONGA-NEXT: s_mov_b32 s4, s2 +; TONGA-NEXT: s_mov_b32 s5, s3 ; TONGA-NEXT: s_waitcnt vmcnt(0) -; TONGA-NEXT: v_mul_hi_i32 v0, v0, s4 -; TONGA-NEXT: v_mul_hi_i32 v1, v1, s4 -; TONGA-NEXT: v_mul_hi_i32 v2, v2, s4 -; TONGA-NEXT: v_mul_hi_i32 v3, v3, s4 +; TONGA-NEXT: v_mul_hi_i32 v0, v0, s0 +; TONGA-NEXT: v_mul_hi_i32 v1, v1, s0 +; TONGA-NEXT: v_mul_hi_i32 v2, v2, s0 +; TONGA-NEXT: v_mul_hi_i32 v3, v3, s0 ; TONGA-NEXT: v_lshrrev_b32_e32 v4, 31, v0 ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 12, v0 ; TONGA-NEXT: v_lshrrev_b32_e32 v5, 31, v1 @@ -2258,26 +2273,26 @@ ; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v5 ; TONGA-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; TONGA-NEXT: v_add_u32_e32 v3, vcc, v3, v7 -; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; TONGA-NEXT: s_endpgm ; ; GFX9-LABEL: scalarize_mulhs_4xi32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 -; GFX9-NEXT: s_mov_b32 s4, 0x1389c755 -; GFX9-NEXT: s_mov_b32 s0, s6 -; GFX9-NEXT: s_mov_b32 s1, s7 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: s_mov_b32 s0, 0x1389c755 +; GFX9-NEXT: s_mov_b32 s4, s2 +; GFX9-NEXT: s_mov_b32 s5, s3 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mul_hi_i32 v0, v0, s4 -; GFX9-NEXT: v_mul_hi_i32 v1, v1, s4 -; GFX9-NEXT: v_mul_hi_i32 v2, v2, s4 -; GFX9-NEXT: v_mul_hi_i32 v3, v3, s4 +; GFX9-NEXT: v_mul_hi_i32 v0, v0, s0 +; GFX9-NEXT: v_mul_hi_i32 v1, v1, s0 +; GFX9-NEXT: v_mul_hi_i32 v2, v2, s0 +; GFX9-NEXT: v_mul_hi_i32 v3, v3, s0 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 12, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 31, v1 @@ -2290,7 +2305,7 @@ ; GFX9-NEXT: v_add_u32_e32 v1, v1, v5 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v6 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v7 -; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: scalarize_mulhs_4xi32: diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -592,58 +592,56 @@ define amdgpu_kernel void @s_test_sdiv32_64(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv32_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dword s6, s[0:1], 0xe -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s7 +; GCN-NEXT: s_load_dword s8, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: s_xor_b32 s4, s7, s6 -; GCN-NEXT: s_ashr_i32 s4, s4, 30 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s3 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_xor_b32 s0, s3, s8 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_or_b32 s4, s4, 1 -; GCN-NEXT: v_mov_b32_e32 v3, s4 -; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_ashr_i32 s0, s0, 30 +; GCN-NEXT: s_or_b32 s0, s0, 1 +; GCN-NEXT: v_mov_b32_e32 v3, s0 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_sdiv32_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_load_dword s6, s[0:1], 0xe -; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s2, -1 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s7 +; GCN-IR-NEXT: s_load_dword s8, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GCN-IR-NEXT: s_mov_b32 s0, s4 -; GCN-IR-NEXT: s_xor_b32 s4, s7, s6 -; GCN-IR-NEXT: s_ashr_i32 s4, s4, 30 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s3 +; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: s_xor_b32 s0, s3, s8 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_or_b32 s4, s4, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v3, s4 -; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_ashr_i32 s0, s0, 30 +; GCN-IR-NEXT: s_or_b32 s0, s0, 1 +; GCN-IR-NEXT: v_mov_b32_e32 v3, s0 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm %1 = ashr i64 %x, 32 %2 = ashr i64 %y, 32 @@ -856,37 +854,37 @@ define amdgpu_kernel void @s_test_sdiv24_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { ; GCN-LABEL: s_test_sdiv24_v2i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x11 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 ; GCN-NEXT: s_ashr_i64 s[8:9], s[8:9], 40 -; GCN-NEXT: s_ashr_i64 s[0:1], s[0:1], 40 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s8 -; GCN-NEXT: s_xor_b32 s0, s8, s0 -; GCN-NEXT: s_ashr_i32 s0, s0, 30 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GCN-NEXT: s_xor_b32 s4, s4, s8 +; GCN-NEXT: s_ashr_i32 s4, s4, 30 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_or_b32 s0, s0, 1 -; GCN-NEXT: v_mov_b32_e32 v3, s0 -; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 40 +; GCN-NEXT: s_or_b32 s4, s4, 1 +; GCN-NEXT: v_mov_b32_e32 v3, s4 +; GCN-NEXT: s_ashr_i64 s[10:11], s[10:11], 40 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: s_ashr_i64 s[10:11], s[10:11], 40 +; GCN-NEXT: s_ashr_i64 s[6:7], s[6:7], 40 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_cvt_f32_i32_e32 v2, s2 -; GCN-NEXT: v_cvt_f32_i32_e32 v3, s10 -; GCN-NEXT: s_xor_b32 s0, s10, s2 -; GCN-NEXT: s_ashr_i32 s0, s0, 30 +; GCN-NEXT: v_cvt_f32_i32_e32 v2, s10 +; GCN-NEXT: v_cvt_f32_i32_e32 v3, s6 +; GCN-NEXT: s_xor_b32 s4, s6, s10 +; GCN-NEXT: s_ashr_i32 s4, s4, 30 ; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GCN-NEXT: s_or_b32 s0, s0, 1 -; GCN-NEXT: v_mov_b32_e32 v5, s0 +; GCN-NEXT: s_or_b32 s4, s4, 1 +; GCN-NEXT: v_mov_b32_e32 v5, s4 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 ; GCN-NEXT: v_trunc_f32_e32 v4, v4 @@ -898,42 +896,42 @@ ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_bfe_i32 v2, v2, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_sdiv24_v2i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GCN-IR-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x11 +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 ; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[8:9], 40 -; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[0:1], 40 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s8 -; GCN-IR-NEXT: s_xor_b32 s0, s8, s0 -; GCN-IR-NEXT: s_ashr_i32 s0, s0, 30 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GCN-IR-NEXT: s_xor_b32 s4, s4, s8 +; GCN-IR-NEXT: s_ashr_i32 s4, s4, 30 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_or_b32 s0, s0, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v3, s0 -; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 40 +; GCN-IR-NEXT: s_or_b32 s4, s4, 1 +; GCN-IR-NEXT: v_mov_b32_e32 v3, s4 +; GCN-IR-NEXT: s_ashr_i64 s[10:11], s[10:11], 40 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-IR-NEXT: s_ashr_i64 s[10:11], s[10:11], 40 +; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[6:7], 40 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v2, s2 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v3, s10 -; GCN-IR-NEXT: s_xor_b32 s0, s10, s2 -; GCN-IR-NEXT: s_ashr_i32 s0, s0, 30 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v2, s10 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v3, s6 +; GCN-IR-NEXT: s_xor_b32 s4, s6, s10 +; GCN-IR-NEXT: s_ashr_i32 s4, s4, 30 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GCN-IR-NEXT: s_or_b32 s0, s0, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v5, s0 +; GCN-IR-NEXT: s_or_b32 s4, s4, 1 +; GCN-IR-NEXT: v_mov_b32_e32 v5, s4 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-IR-NEXT: v_mul_f32_e32 v4, v3, v4 ; GCN-IR-NEXT: v_trunc_f32_e32 v4, v4 @@ -945,7 +943,7 @@ ; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-IR-NEXT: v_bfe_i32 v2, v2, 0, 24 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %1 = ashr <2 x i64> %x, %2 = ashr <2 x i64> %y, @@ -960,15 +958,15 @@ ; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GCN-NEXT: s_load_dword s2, s[0:1], 0xb ; GCN-NEXT: s_load_dword s3, s[0:1], 0xc -; GCN-NEXT: s_load_dword s8, s[0:1], 0xd -; GCN-NEXT: s_load_dword s0, s[0:1], 0xe +; GCN-NEXT: s_load_dword s8, s[0:1], 0xe +; GCN-NEXT: s_load_dword s0, s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: s_sext_i32_i16 s1, s3 -; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: s_sext_i32_i16 s0, s0 -; GCN-NEXT: v_alignbit_b32 v0, s0, v0, 24 +; GCN-NEXT: s_sext_i32_i16 s3, s8 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_alignbit_b32 v0, s3, v0, 24 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, v0 ; GCN-NEXT: v_alignbit_b32 v2, s1, v2, 24 ; GCN-NEXT: v_cvt_f32_i32_e32 v3, v2 @@ -992,26 +990,26 @@ ; ; GCN-IR-LABEL: s_test_sdiv24_48: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xb ; GCN-IR-NEXT: s_load_dword s3, s[0:1], 0xc -; GCN-IR-NEXT: s_load_dword s6, s[0:1], 0xd -; GCN-IR-NEXT: s_load_dword s0, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dword s5, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-IR-NEXT: s_load_dword s4, s[0:1], 0xd +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_sext_i32_i16 s3, s3 -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[2:3], 24 -; GCN-IR-NEXT: s_sext_i32_i16 s7, s0 -; GCN-IR-NEXT: s_ashr_i32 s0, s3, 31 -; GCN-IR-NEXT: s_mov_b32 s1, s0 -; GCN-IR-NEXT: s_ashr_i64 s[12:13], s[6:7], 24 -; GCN-IR-NEXT: s_ashr_i32 s2, s7, 31 -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[0:1], s[8:9] +; GCN-IR-NEXT: s_sext_i32_i16 s5, s5 +; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[2:3], 24 +; GCN-IR-NEXT: s_ashr_i32 s2, s3, 31 ; GCN-IR-NEXT: s_mov_b32 s3, s2 -; GCN-IR-NEXT: s_sub_u32 s10, s6, s0 -; GCN-IR-NEXT: s_subb_u32 s11, s7, s0 -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[2:3], s[12:13] -; GCN-IR-NEXT: s_sub_u32 s6, s6, s2 -; GCN-IR-NEXT: s_subb_u32 s7, s7, s2 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 24 +; GCN-IR-NEXT: s_ashr_i32 s4, s5, 31 +; GCN-IR-NEXT: s_xor_b64 s[6:7], s[2:3], s[6:7] +; GCN-IR-NEXT: s_mov_b32 s5, s4 +; GCN-IR-NEXT: s_sub_u32 s10, s6, s2 +; GCN-IR-NEXT: s_subb_u32 s11, s7, s2 +; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], s[8:9] +; GCN-IR-NEXT: s_sub_u32 s6, s6, s4 +; GCN-IR-NEXT: s_subb_u32 s7, s7, s4 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[6:7], 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[10:11], 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 @@ -1089,16 +1087,16 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 ; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[18:19] ; GCN-IR-NEXT: .LBB9_6: ; %udiv-end -; GCN-IR-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] -; GCN-IR-NEXT: v_xor_b32_e32 v0, s0, v0 -; GCN-IR-NEXT: v_xor_b32_e32 v1, s1, v1 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s1 -; GCN-IR-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 +; GCN-IR-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3] +; GCN-IR-NEXT: v_xor_b32_e32 v0, s2, v0 +; GCN-IR-NEXT: v_xor_b32_e32 v1, s3, v1 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s3 +; GCN-IR-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s6, -1 -; GCN-IR-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 -; GCN-IR-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 +; GCN-IR-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %1 = ashr i48 %x, 24 %2 = ashr i48 %y, 24 @@ -1881,54 +1879,54 @@ define amdgpu_kernel void @s_test_sdiv24_k_den_i64(i64 addrspace(1)* %out, i64 %x) { ; GCN-LABEL: s_test_sdiv24_k_den_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s8, 0x46b6fe00 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[6:7], s[6:7], 40 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: s_ashr_i32 s4, s6, 30 -; GCN-NEXT: s_or_b32 s4, s4, 1 +; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 40 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_ashr_i32 s0, s2, 30 +; GCN-NEXT: s_or_b32 s0, s0, 1 ; GCN-NEXT: v_mul_f32_e32 v1, 0x38331158, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_mad_f32 v0, -v1, s8, v0 ; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s8 ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 -; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_sdiv24_k_den_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_mov_b32 s8, 0x46b6fe00 -; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[6:7], 40 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GCN-IR-NEXT: s_mov_b32 s0, s4 -; GCN-IR-NEXT: s_ashr_i32 s4, s6, 30 -; GCN-IR-NEXT: s_or_b32 s4, s4, 1 +; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 40 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: s_ashr_i32 s0, s2, 30 +; GCN-IR-NEXT: s_or_b32 s0, s0, 1 ; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x38331158, v0 ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-IR-NEXT: v_mad_f32 v0, -v1, s8, v0 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s4 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s0 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s8 ; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 -; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm %x.shr = ashr i64 %x, 40 %result = sdiv i64 %x.shr, 23423 diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -186,8 +186,8 @@ } ; GCN-LABEL: {{^}}mul_v2half: -; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}} ; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}} +; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}} ; NOSDWA: v_mul_f16_e32 v[[DST_MUL:[0-9]+]], v[[DST0]], v[[DST1]] ; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MUL]] ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[DST_SHL]] @@ -413,8 +413,8 @@ ; GCN-LABEL: {{^}}mac_v2half: -; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}} ; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}} +; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}} ; NOSDWA: v_mac_f16_e32 v[[DST_MAC:[0-9]+]], v[[DST0]], v[[DST1]] ; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MAC]] ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[DST_SHL]] diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll --- a/llvm/test/CodeGen/AMDGPU/select.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll @@ -6,9 +6,9 @@ ; SI-LABEL: select_f16: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11 ; SI-NEXT: s_mov_b32 s18, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s16, s6 @@ -47,9 +47,9 @@ ; VI-LABEL: select_f16: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x44 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x44 ; VI-NEXT: s_mov_b32 s18, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s16, s6 @@ -433,42 +433,42 @@ ; SI-NEXT: s_mov_b32 s21, s9 ; SI-NEXT: s_mov_b32 s22, s2 ; SI-NEXT: s_mov_b32 s23, s3 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; SI-NEXT: s_mov_b32 s8, s10 ; SI-NEXT: s_mov_b32 s9, s11 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; SI-NEXT: s_mov_b32 s10, s2 ; SI-NEXT: s_mov_b32 s11, s3 -; SI-NEXT: s_mov_b32 s14, s2 -; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: buffer_load_dword v0, off, s[16:19], 0 -; SI-NEXT: buffer_load_dword v1, off, s[20:23], 0 -; SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v2, off, s[20:23], 0 ; SI-NEXT: buffer_load_dword v3, off, s[8:11], 0 ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cmp_lt_f32_e32 vcc, v5, v6 -; SI-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc -; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; SI-NEXT: v_cmp_lt_f32_e32 vcc, v1, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -478,10 +478,12 @@ ; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x44 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s18, s2 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s16, s6 ; VI-NEXT: s_mov_b32 s17, s7 +; VI-NEXT: s_mov_b32 s18, s2 ; VI-NEXT: s_mov_b32 s19, s3 ; VI-NEXT: s_mov_b32 s20, s8 ; VI-NEXT: s_mov_b32 s21, s9 @@ -490,24 +492,22 @@ ; VI-NEXT: s_mov_b32 s8, s10 ; VI-NEXT: s_mov_b32 s9, s11 ; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 -; VI-NEXT: s_mov_b32 s14, s2 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0 +; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; VI-NEXT: buffer_load_dword v1, off, s[20:23], 0 -; VI-NEXT: buffer_load_dword v2, off, s[12:15], 0 +; VI-NEXT: buffer_load_dword v2, off, s[16:19], 0 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_dword v3, off, s[8:11], 0 ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; VI-NEXT: v_cmp_lt_f16_e32 vcc, v2, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v6, v5 ; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc @@ -546,11 +546,11 @@ ; SI-NEXT: s_mov_b32 s17, s5 ; SI-NEXT: s_mov_b32 s18, s10 ; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s2, 0x3f200000 @@ -558,12 +558,12 @@ ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 @@ -645,11 +645,11 @@ ; SI-NEXT: s_mov_b32 s17, s5 ; SI-NEXT: s_mov_b32 s18, s10 ; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s2, 0x3f200000 @@ -657,12 +657,12 @@ ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 @@ -744,11 +744,11 @@ ; SI-NEXT: s_mov_b32 s17, s5 ; SI-NEXT: s_mov_b32 s18, s10 ; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 ; SI-NEXT: v_mov_b32_e32 v3, 0x3f200000 @@ -757,11 +757,11 @@ ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 @@ -782,31 +782,31 @@ ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s12, s2 -; VI-NEXT: s_mov_b32 s13, s3 ; VI-NEXT: s_mov_b32 s16, s4 ; VI-NEXT: s_mov_b32 s17, s5 -; VI-NEXT: s_mov_b32 s18, s10 -; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s15, s11 ; VI-NEXT: s_mov_b32 s4, s6 ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0 +; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 ; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 ; VI-NEXT: v_mov_b32_e32 v3, 0x3800 ; VI-NEXT: v_mov_b32_e32 v4, 0x3900 ; VI-NEXT: s_mov_b32 s8, s0 ; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v1, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 @@ -845,11 +845,11 @@ ; SI-NEXT: s_mov_b32 s17, s5 ; SI-NEXT: s_mov_b32 s18, s10 ; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 ; SI-NEXT: v_mov_b32_e32 v3, 0x3f200000 @@ -857,13 +857,13 @@ ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 @@ -883,31 +883,31 @@ ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s12, s2 -; VI-NEXT: s_mov_b32 s13, s3 ; VI-NEXT: s_mov_b32 s16, s4 ; VI-NEXT: s_mov_b32 s17, s5 -; VI-NEXT: s_mov_b32 s18, s10 -; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s15, s11 ; VI-NEXT: s_mov_b32 s4, s6 ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0 +; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 ; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 ; VI-NEXT: v_mov_b32_e32 v3, 0x3800 ; VI-NEXT: v_mov_b32_e32 v4, 0x3900 ; VI-NEXT: s_mov_b32 s8, s0 ; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; VI-NEXT: v_cmp_lt_f16_e32 vcc, v1, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 diff --git a/llvm/test/CodeGen/AMDGPU/select64.ll b/llvm/test/CodeGen/AMDGPU/select64.ll --- a/llvm/test/CodeGen/AMDGPU/select64.ll +++ b/llvm/test/CodeGen/AMDGPU/select64.ll @@ -6,47 +6,47 @@ define amdgpu_kernel void @select0(i64 addrspace(1)* %out, i32 %cond, i64 %in) { ; SI-LABEL: select0: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lt_u32 s2, 6 -; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_cmp_lt_u32 s6, 6 +; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc -; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: select0: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_cmp_lt_u32 s4, 6 -; VI-NEXT: s_cselect_b64 s[0:1], s[0:1], 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX90A-LABEL: select0: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dword s6, s[0:1], 0x2c -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_cmp_lt_u32 s6, 6 -; GFX90A-NEXT: s_cselect_b64 s[0:1], s[4:5], 0 +; GFX90A-NEXT: s_cselect_b64 s[0:1], s[2:3], 0 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX90A-NEXT: s_endpgm entry: %0 = icmp ugt i32 %cond, 5 @@ -58,38 +58,38 @@ define amdgpu_kernel void @select_trunc_i64(i32 addrspace(1)* %out, i32 %cond, i64 %in) nounwind { ; SI-LABEL: select_trunc_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dword s0, s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dword s5, s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lt_u32 s2, 6 -; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: s_cmp_lt_u32 s4, 6 +; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: select_trunc_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dword s3, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: s_cmp_lt_u32 s4, 6 -; VI-NEXT: s_cselect_b32 s0, s0, 0 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_cmp_lt_u32 s2, 6 +; VI-NEXT: s_cselect_b32 s2, s3, 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX90A-LABEL: select_trunc_i64: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX90A-NEXT: s_load_dword s5, s[0:1], 0x34 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_cmp_lt_u32 s4, 6 @@ -107,39 +107,39 @@ define amdgpu_kernel void @select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 %a, i64 %b) nounwind { ; SI-LABEL: select_trunc_i64_2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_load_dword s8, s[0:1], 0xb -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_gt_u32 s8, 5 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: select_trunc_i64_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: s_cmp_gt_u32 s6, 5 -; VI-NEXT: s_cselect_b32 s0, s0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_cmp_gt_u32 s2, 5 +; VI-NEXT: s_cselect_b32 s2, s4, s6 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX90A-LABEL: select_trunc_i64_2: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x2c ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_cmp_gt_u32 s8, 5 @@ -157,52 +157,52 @@ define amdgpu_kernel void @v_select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { ; SI-LABEL: v_select_trunc_i64_2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 ; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s1, s[8:9], 0x0 -; SI-NEXT: s_load_dword s2, s[10:11], 0x0 +; SI-NEXT: s_load_dword s1, s[6:7], 0x0 +; SI-NEXT: s_load_dword s2, s[4:5], 0x0 ; SI-NEXT: s_cmp_gt_u32 s0, 5 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: v_mov_b32_e32 v1, s2 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_select_trunc_i64_2: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s1, s[4:5], 0x0 +; VI-NEXT: s_load_dword s3, s[4:5], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: s_cmp_gt_u32 s0, 5 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_cmp_gt_u32 s2, 5 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cselect_b32 s0, s1, s4 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_cselect_b32 s2, s3, s4 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX90A-LABEL: v_select_trunc_i64_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x2c ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX90A-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX90A-NEXT: s_load_dword s9, s[4:5], 0x0 +; GFX90A-NEXT: s_load_dword s10, s[6:7], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_cmp_gt_u32 s8, 5 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_cselect_b32 s0, s0, s1 +; GFX90A-NEXT: s_cselect_b32 s0, s9, s10 ; GFX90A-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-NEXT: s_endpgm @@ -218,57 +218,57 @@ define amdgpu_kernel void @v_select_i64_split_imm(i64 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { ; SI-LABEL: v_select_i64_split_imm: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SI-NEXT: s_cmp_gt_u32 s2, 5 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_cmp_gt_u32 s6, 5 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_cndmask_b32_e32 v1, 63, v0, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_select_i64_split_imm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: s_load_dword s6, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s4, 0 ; VI-NEXT: s_mov_b32 s5, 63 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_cmp_gt_u32 s6, 5 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX90A-LABEL: v_select_i64_split_imm: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX90A-NEXT: s_load_dword s6, s[0:1], 0x2c -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NEXT: s_mov_b32 s4, 0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX90A-NEXT: s_mov_b32 s2, 0 ; GFX90A-NEXT: s_cmp_gt_u32 s6, 5 -; GFX90A-NEXT: s_mov_b32 s5, 63 +; GFX90A-NEXT: s_mov_b32 s3, 63 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] +; GFX90A-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX90A-NEXT: s_endpgm %cmp = icmp ugt i32 %cond, 5 %a = load i64, i64 addrspace(1)* %aptr, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll b/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll --- a/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll +++ b/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll @@ -4,15 +4,15 @@ define amdgpu_kernel void @sext_i16_to_i32_uniform(i32 addrspace(1)* %out, i16 %a, i32 %b) { ; GCN-LABEL: sext_i16_to_i32_uniform: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_sext_i32_i16 s0, s0 -; GCN-NEXT: s_add_i32 s0, s1, s0 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_sext_i32_i16 s4, s4 +; GCN-NEXT: s_add_i32 s4, s5, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %sext = sext i16 %a to i32 %res = add i32 %b, %sext @@ -24,18 +24,18 @@ define amdgpu_kernel void @sext_i16_to_i64_uniform(i64 addrspace(1)* %out, i16 %a, i64 %b) { ; GCN-LABEL: sext_i16_to_i64_uniform: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s2, s[0:1], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0xd +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000 -; GCN-NEXT: s_add_u32 s0, s0, s2 -; GCN-NEXT: s_addc_u32 s1, s1, s3 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 +; GCN-NEXT: s_add_u32 s4, s6, s4 +; GCN-NEXT: s_addc_u32 s5, s7, s5 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm %sext = sext i16 %a to i64 %res = add i64 %b, %sext @@ -46,14 +46,14 @@ define amdgpu_kernel void @sext_i16_to_i32_divergent(i32 addrspace(1)* %out, i16 %a, i32 %b) { ; GCN-LABEL: sext_i16_to_i32_divergent: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xb -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s4, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.truncated = trunc i32 %tid to i16 @@ -67,15 +67,15 @@ define amdgpu_kernel void @sext_i16_to_i64_divergent(i64 addrspace(1)* %out, i16 %a, i64 %b) { ; GCN-LABEL: sext_i16_to_i64_divergent: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xb -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s4, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.truncated = trunc i32 %tid to i16 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll --- a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll @@ -11,8 +11,8 @@ define amdgpu_kernel void @sgpr_if_else_salu_br(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { ; SI-LABEL: sgpr_if_else_salu_br: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_load_dword s0, s[0:1], 0xf ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s8, 0 @@ -106,9 +106,9 @@ define amdgpu_kernel void @sgpr_if_else_valu_br(i32 addrspace(1)* %out, float %a, i32 %b, i32 %c, i32 %d, i32 %e) { ; SI-LABEL: sgpr_if_else_valu_br: ; SI: ; %bb.0: ; %entry -; SI-NEXT: v_cvt_f32_u32_e32 v0, v0 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xc +; SI-NEXT: v_cvt_f32_u32_e32 v0, v0 ; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v0 ; SI-NEXT: s_and_saveexec_b64 s[6:7], vcc diff --git a/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll b/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll --- a/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll @@ -5,23 +5,23 @@ define amdgpu_kernel void @v_uextract_bit_31_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { ; GCN-LABEL: v_uextract_bit_31_i128: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_ashr_i32 s3, s2, 31 ; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 4 ; GCN-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b64 s[4:5], s[10:11] +; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] ; GCN-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NEXT: buffer_load_dword v0, v[4:5], s[4:7], 0 addr64 +; GCN-NEXT: buffer_load_dword v0, v[4:5], s[8:11], 0 addr64 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: s_mov_b64 s[10:11], s[6:7] +; GCN-NEXT: s_mov_b64 s[6:7], s[10:11] ; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_lshrrev_b32_e32 v0, 31, v0 -; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[8:11], 0 addr64 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 ; GCN-NEXT: s_endpgm %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x() %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x @@ -132,22 +132,22 @@ define amdgpu_kernel void @v_uextract_bit_34_100_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { ; GCN-LABEL: v_uextract_bit_34_100_i128: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v8, 4, v0 ; GCN-NEXT: v_mov_b32_e32 v9, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b64 s[0:1], s[6:7] -; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[8:9], s[0:3], 0 addr64 -; GCN-NEXT: s_mov_b64 s[6:7], s[2:3] +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] ; GCN-NEXT: v_mov_b32_e32 v7, v9 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_lshl_b64 v[4:5], v[2:3], 30 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 2, v1 ; GCN-NEXT: v_bfe_u32 v6, v3, 2, 2 ; GCN-NEXT: v_or_b32_e32 v4, v0, v4 -; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[8:9], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[8:9], s[0:3], 0 addr64 ; GCN-NEXT: s_endpgm %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll --- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll @@ -451,59 +451,59 @@ define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_shl_v2i128ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x8 -; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 +; GCN-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v10, 16 ; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: v_mov_b32_e32 v11, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[16:17], 64 -; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[18:19], 0 -; GCN-NEXT: s_sub_i32 s6, 64, s16 -; GCN-NEXT: s_sub_i32 s4, s16, 64 -; GCN-NEXT: s_lshr_b64 s[6:7], s[8:9], s6 -; GCN-NEXT: s_lshl_b64 s[24:25], s[10:11], s16 -; GCN-NEXT: s_lshl_b64 s[4:5], s[8:9], s4 +; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[8:9], 64 +; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[10:11], 0 +; GCN-NEXT: s_sub_i32 s6, 64, s8 +; GCN-NEXT: s_sub_i32 s4, s8, 64 +; GCN-NEXT: s_lshr_b64 s[6:7], s[16:17], s6 +; GCN-NEXT: s_lshl_b64 s[24:25], s[18:19], s8 +; GCN-NEXT: s_lshl_b64 s[4:5], s[16:17], s4 ; GCN-NEXT: s_or_b64 s[6:7], s[24:25], s[6:7] ; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] -; GCN-NEXT: s_or_b64 s[0:1], s[16:17], s[18:19] +; GCN-NEXT: s_or_b64 s[0:1], s[8:9], s[10:11] ; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: v_cmp_eq_u64_e64 s[0:1], s[0:1], 0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s11 +; GCN-NEXT: v_mov_b32_e32 v1, s19 ; GCN-NEXT: v_cndmask_b32_e64 v3, v0, v1, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s6 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s10 +; GCN-NEXT: v_mov_b32_e32 v1, s18 ; GCN-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[0:1] -; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[20:21], 64 -; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[22:23], 0 -; GCN-NEXT: s_sub_i32 s6, 64, s20 -; GCN-NEXT: s_sub_i32 s4, s20, 64 -; GCN-NEXT: s_lshr_b64 s[6:7], s[12:13], s6 -; GCN-NEXT: s_lshl_b64 s[10:11], s[14:15], s20 -; GCN-NEXT: s_lshl_b64 s[4:5], s[12:13], s4 +; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[12:13], 64 +; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[14:15], 0 +; GCN-NEXT: s_sub_i32 s6, 64, s12 +; GCN-NEXT: s_sub_i32 s4, s12, 64 +; GCN-NEXT: s_lshr_b64 s[6:7], s[20:21], s6 +; GCN-NEXT: s_lshl_b64 s[10:11], s[22:23], s12 +; GCN-NEXT: s_lshl_b64 s[4:5], s[20:21], s4 ; GCN-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] ; GCN-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] -; GCN-NEXT: s_or_b64 s[2:3], s[20:21], s[22:23] +; GCN-NEXT: s_or_b64 s[2:3], s[12:13], s[14:15] ; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[2:3], 0 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v1, s15 +; GCN-NEXT: v_mov_b32_e32 v1, s23 ; GCN-NEXT: v_cndmask_b32_e64 v7, v0, v1, s[2:3] ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s6 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v1, s14 +; GCN-NEXT: v_mov_b32_e32 v1, s22 ; GCN-NEXT: v_cndmask_b32_e64 v6, v0, v1, s[2:3] -; GCN-NEXT: s_lshl_b64 s[2:3], s[8:9], s16 +; GCN-NEXT: s_lshl_b64 s[2:3], s[16:17], s8 ; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc ; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: s_lshl_b64 s[2:3], s[12:13], s20 +; GCN-NEXT: s_lshl_b64 s[2:3], s[20:21], s12 ; GCN-NEXT: v_mov_b32_e32 v4, s3 ; GCN-NEXT: v_cndmask_b32_e64 v5, 0, v4, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v4, s2 @@ -521,59 +521,59 @@ define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_lshr_v2i128_ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x8 -; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 +; GCN-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v10, 16 ; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: v_mov_b32_e32 v11, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[16:17], 64 -; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[18:19], 0 -; GCN-NEXT: s_sub_i32 s6, 64, s16 -; GCN-NEXT: s_sub_i32 s4, s16, 64 -; GCN-NEXT: s_lshl_b64 s[6:7], s[10:11], s6 -; GCN-NEXT: s_lshr_b64 s[24:25], s[8:9], s16 -; GCN-NEXT: s_lshr_b64 s[4:5], s[10:11], s4 +; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[8:9], 64 +; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[10:11], 0 +; GCN-NEXT: s_sub_i32 s6, 64, s8 +; GCN-NEXT: s_sub_i32 s4, s8, 64 +; GCN-NEXT: s_lshl_b64 s[6:7], s[18:19], s6 +; GCN-NEXT: s_lshr_b64 s[24:25], s[16:17], s8 +; GCN-NEXT: s_lshr_b64 s[4:5], s[18:19], s4 ; GCN-NEXT: s_or_b64 s[6:7], s[24:25], s[6:7] ; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] -; GCN-NEXT: s_or_b64 s[0:1], s[16:17], s[18:19] +; GCN-NEXT: s_or_b64 s[0:1], s[8:9], s[10:11] ; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: v_cmp_eq_u64_e64 s[0:1], s[0:1], 0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s9 +; GCN-NEXT: v_mov_b32_e32 v1, s17 ; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NEXT: v_mov_b32_e32 v2, s16 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[20:21], 64 -; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[22:23], 0 -; GCN-NEXT: s_sub_i32 s6, 64, s20 -; GCN-NEXT: s_sub_i32 s4, s20, 64 -; GCN-NEXT: s_lshl_b64 s[6:7], s[14:15], s6 -; GCN-NEXT: s_lshr_b64 s[8:9], s[12:13], s20 -; GCN-NEXT: s_lshr_b64 s[4:5], s[14:15], s4 -; GCN-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] +; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[12:13], 64 +; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[14:15], 0 +; GCN-NEXT: s_sub_i32 s6, 64, s12 +; GCN-NEXT: s_sub_i32 s4, s12, 64 +; GCN-NEXT: s_lshl_b64 s[6:7], s[22:23], s6 +; GCN-NEXT: s_lshr_b64 s[10:11], s[20:21], s12 +; GCN-NEXT: s_lshr_b64 s[4:5], s[22:23], s4 +; GCN-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] ; GCN-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] -; GCN-NEXT: s_or_b64 s[2:3], s[20:21], s[22:23] +; GCN-NEXT: s_or_b64 s[2:3], s[12:13], s[14:15] ; GCN-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NEXT: v_mov_b32_e32 v3, s7 ; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[2:3], 0 ; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NEXT: v_mov_b32_e32 v3, s21 ; GCN-NEXT: v_cndmask_b32_e64 v5, v2, v3, s[2:3] ; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: v_mov_b32_e32 v3, s6 ; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v3, s12 +; GCN-NEXT: v_mov_b32_e32 v3, s20 ; GCN-NEXT: v_cndmask_b32_e64 v4, v2, v3, s[2:3] -; GCN-NEXT: s_lshr_b64 s[2:3], s[10:11], s16 +; GCN-NEXT: s_lshr_b64 s[2:3], s[18:19], s8 ; GCN-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc ; GCN-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NEXT: s_lshr_b64 s[2:3], s[14:15], s20 +; GCN-NEXT: s_lshr_b64 s[2:3], s[22:23], s12 ; GCN-NEXT: v_mov_b32_e32 v6, s3 ; GCN-NEXT: v_cndmask_b32_e64 v7, 0, v6, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v6, s2 @@ -591,61 +591,61 @@ define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_ashr_v2i128_ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x8 -; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 +; GCN-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v10, 16 ; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: v_mov_b32_e32 v11, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[16:17], 64 -; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[18:19], 0 -; GCN-NEXT: s_sub_i32 s6, 64, s16 -; GCN-NEXT: s_sub_i32 s4, s16, 64 -; GCN-NEXT: s_lshl_b64 s[6:7], s[10:11], s6 -; GCN-NEXT: s_lshr_b64 s[24:25], s[8:9], s16 -; GCN-NEXT: s_ashr_i64 s[4:5], s[10:11], s4 +; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[8:9], 64 +; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[10:11], 0 +; GCN-NEXT: s_sub_i32 s6, 64, s8 +; GCN-NEXT: s_sub_i32 s4, s8, 64 +; GCN-NEXT: s_lshl_b64 s[6:7], s[18:19], s6 +; GCN-NEXT: s_lshr_b64 s[24:25], s[16:17], s8 +; GCN-NEXT: s_ashr_i64 s[4:5], s[18:19], s4 ; GCN-NEXT: s_or_b64 s[6:7], s[24:25], s[6:7] ; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] -; GCN-NEXT: s_or_b64 s[0:1], s[16:17], s[18:19] +; GCN-NEXT: s_or_b64 s[0:1], s[8:9], s[10:11] ; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: v_cmp_eq_u64_e64 s[0:1], s[0:1], 0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s9 +; GCN-NEXT: v_mov_b32_e32 v1, s17 ; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NEXT: v_mov_b32_e32 v2, s16 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[20:21], 64 -; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[22:23], 0 -; GCN-NEXT: s_sub_i32 s6, 64, s20 -; GCN-NEXT: s_sub_i32 s4, s20, 64 -; GCN-NEXT: s_lshl_b64 s[6:7], s[14:15], s6 -; GCN-NEXT: s_lshr_b64 s[8:9], s[12:13], s20 -; GCN-NEXT: s_ashr_i64 s[4:5], s[14:15], s4 -; GCN-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] +; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[12:13], 64 +; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[14:15], 0 +; GCN-NEXT: s_sub_i32 s6, 64, s12 +; GCN-NEXT: s_sub_i32 s4, s12, 64 +; GCN-NEXT: s_lshl_b64 s[6:7], s[22:23], s6 +; GCN-NEXT: s_lshr_b64 s[10:11], s[20:21], s12 +; GCN-NEXT: s_ashr_i64 s[4:5], s[22:23], s4 +; GCN-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] ; GCN-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] -; GCN-NEXT: s_or_b64 s[2:3], s[20:21], s[22:23] +; GCN-NEXT: s_or_b64 s[2:3], s[12:13], s[14:15] ; GCN-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NEXT: v_mov_b32_e32 v3, s7 ; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[2:3], 0 ; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NEXT: v_mov_b32_e32 v3, s21 ; GCN-NEXT: v_cndmask_b32_e64 v5, v2, v3, s[2:3] ; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: v_mov_b32_e32 v3, s6 ; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v3, s12 +; GCN-NEXT: v_mov_b32_e32 v3, s20 ; GCN-NEXT: v_cndmask_b32_e64 v4, v2, v3, s[2:3] -; GCN-NEXT: s_ashr_i32 s4, s11, 31 -; GCN-NEXT: s_ashr_i64 s[2:3], s[10:11], s16 +; GCN-NEXT: s_ashr_i32 s4, s19, 31 +; GCN-NEXT: s_ashr_i64 s[2:3], s[18:19], s8 ; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: v_mov_b32_e32 v6, s2 -; GCN-NEXT: s_ashr_i32 s4, s15, 31 -; GCN-NEXT: s_ashr_i64 s[2:3], s[14:15], s20 +; GCN-NEXT: s_ashr_i32 s4, s23, 31 +; GCN-NEXT: s_ashr_i64 s[2:3], s[22:23], s12 ; GCN-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc ; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; GCN-NEXT: v_mov_b32_e32 v6, s4 diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll --- a/llvm/test/CodeGen/AMDGPU/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.ll @@ -10,38 +10,36 @@ define amdgpu_kernel void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { ; SI-LABEL: shl_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshl_b32_e32 v1, v1, v3 ; SI-NEXT: v_lshl_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: shl_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; VI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x8 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s4, s9, s7 -; VI-NEXT: s_lshl_b32 s5, s8, s6 -; VI-NEXT: v_mov_b32_e32 v0, s5 -; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_lshl_b32 s5, s5, s7 +; VI-NEXT: s_lshl_b32 s4, s4, s6 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -74,41 +72,39 @@ define amdgpu_kernel void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { ; SI-LABEL: shl_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshl_b32_e32 v3, v3, v7 ; SI-NEXT: v_lshl_b32_e32 v2, v2, v6 ; SI-NEXT: v_lshl_b32_e32 v1, v1, v5 ; SI-NEXT: v_lshl_b32_e32 v0, v0, v4 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: shl_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x10 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x10 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s7, s11, s7 -; VI-NEXT: s_lshl_b32 s6, s10, s6 -; VI-NEXT: s_lshl_b32 s5, s9, s5 -; VI-NEXT: s_lshl_b32 s4, s8, s4 +; VI-NEXT: s_lshl_b32 s7, s7, s11 +; VI-NEXT: s_lshl_b32 s6, s6, s10 +; VI-NEXT: s_lshl_b32 s5, s5, s9 +; VI-NEXT: s_lshl_b32 s4, s4, s8 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 @@ -147,40 +143,40 @@ define amdgpu_kernel void @shl_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { ; SI-LABEL: shl_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 -; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:2 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:2 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: shl_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 -; VI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:2 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:2 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v0, v1, v0 -; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: shl_i16: @@ -222,38 +218,38 @@ ; SI-LABEL: shl_i16_v_s: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dword s12, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, s8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, s12, v0 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: shl_i16_v_s: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dword s12, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v0, s8, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, s12, v0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -295,40 +291,40 @@ ; SI-LABEL: shl_i16_v_compute_s: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dword s12, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 -; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, s8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, s12, v0 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: shl_i16_v_compute_s: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dword s12, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; VI-NEXT: s_add_i32 s12, s12, 3 ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 -; VI-NEXT: s_add_i32 s8, s8, 3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v0, s8, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, s12, v0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -377,55 +373,55 @@ define amdgpu_kernel void @shl_i16_computed_amount(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { ; SI-LABEL: shl_i16_computed_amount: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s14, 0 -; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: s_mov_b64 s[12:13], s[6:7] +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: s_mov_b64 s[12:13], s[2:3] ; SI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[12:15], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: shl_i16_computed_amount: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0 -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_load_ushort v0, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: v_add_u16_e32 v0, 3, v0 ; VI-NEXT: v_lshlrev_b16_e32 v0, v0, v2 -; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: shl_i16_computed_amount: @@ -479,26 +475,26 @@ define amdgpu_kernel void @shl_i16_i_s(i16 addrspace(1)* %out, i16 zeroext %a) { ; SI-LABEL: shl_i16_i_s: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s0, s0, 12 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_lshl_b32 s4, s4, 12 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: shl_i16_i_s: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s0, s0, 12 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: s_lshl_b32 s4, s4, 12 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: shl_i16_i_s: @@ -537,23 +533,23 @@ define amdgpu_kernel void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { ; SI-LABEL: shl_v2i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s14, 0 -; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: s_mov_b64 s[12:13], s[6:7] +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: s_mov_b64 s[12:13], s[2:3] ; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 offset:4 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -563,7 +559,7 @@ ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: shl_v2i16: @@ -571,18 +567,18 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s4, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_load_dword s4, s[2:3], 0x0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s5, s4, 16 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e64 v2, v0, s4 ; VI-NEXT: v_lshlrev_b16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -638,17 +634,17 @@ define amdgpu_kernel void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { ; SI-LABEL: shl_v4i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[0:1], s[6:7] -; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 -; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8 -; SI-NEXT: s_mov_b32 s0, 0xffff -; SI-NEXT: s_mov_b64 s[6:7], s[2:3] +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 +; SI-NEXT: s_mov_b32 s4, 0xffff +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 @@ -659,13 +655,13 @@ ; SI-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; SI-NEXT: v_lshlrev_b32_e32 v4, v9, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, v8, v6 -; SI-NEXT: v_and_b32_e32 v3, s0, v3 -; SI-NEXT: v_and_b32_e32 v2, s0, v2 +; SI-NEXT: v_and_b32_e32 v3, s4, v3 +; SI-NEXT: v_and_b32_e32 v2, s4, v2 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_or_b32_e32 v2, v2, v5 -; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: shl_v4i16: @@ -781,36 +777,34 @@ define amdgpu_kernel void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { ; SI-LABEL: shl_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v2 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: shl_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], s6 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: shl_i64: @@ -850,37 +844,35 @@ define amdgpu_kernel void @shl_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { ; SI-LABEL: shl_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v6 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v4 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: shl_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x10 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x10 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[6:7], s[10:11], s6 -; VI-NEXT: s_lshl_b64 s[4:5], s[8:9], s4 +; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], s10 +; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], s8 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 @@ -944,48 +936,46 @@ ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s8, s6 ; SI-NEXT: s_mov_b32 s9, s7 -; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 -; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:48 +; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 +; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48 +; SI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0 +; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32 ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v10 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v6 +; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshl_b64 v[6:7], v[6:7], v13 -; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], v11 -; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v8 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: v_lshl_b64 v[9:10], v[9:10], v13 +; SI-NEXT: v_lshl_b64 v[7:8], v[7:8], v11 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: shl_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s12 -; VI-NEXT: s_mov_b32 s1, s13 -; VI-NEXT: s_load_dwordx8 s[4:11], s[14:15], 0x0 -; VI-NEXT: s_load_dwordx8 s[12:19], s[14:15], 0x20 +; VI-NEXT: s_load_dwordx8 s[0:7], s[18:19], 0x0 +; VI-NEXT: s_load_dwordx8 s[8:15], s[18:19], 0x20 +; VI-NEXT: s_mov_b32 s19, 0xf000 +; VI-NEXT: s_mov_b32 s18, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[10:11], s[10:11], s18 -; VI-NEXT: s_lshl_b64 s[8:9], s[8:9], s16 ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], s14 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], s12 -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_mov_b32_e32 v2, s10 -; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; VI-NEXT: s_nop 0 +; VI-NEXT: s_lshl_b64 s[2:3], s[2:3], s10 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s8 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16 +; VI-NEXT: s_nop 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: shl_v4i64: @@ -1064,26 +1054,26 @@ define amdgpu_kernel void @s_shl_32_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) { ; SI-LABEL: s_shl_32_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0x13 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s4, s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_shl_32_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x4c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: s_shl_32_i64: @@ -1105,20 +1095,20 @@ define amdgpu_kernel void @v_shl_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { ; SI-LABEL: v_shl_32_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NEXT: s_ashr_i32 s3, s2, 31 ; SI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[4:5], s[10:11] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b64 s[10:11], s[6:7] +; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] ; SI-NEXT: v_mov_b32_e32 v2, 0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 +; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_shl_32_i64: @@ -1172,34 +1162,34 @@ define amdgpu_kernel void @s_shl_constant_i64(i64 addrspace(1)* %out, i64 %a) { ; SI-LABEL: s_shl_constant_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s9, 0xffff -; SI-NEXT: s_mov_b32 s8, s2 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_lshl_b64 s[4:5], s[8:9], s6 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_lshl_b64 s[0:1], s[8:9], s2 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_shl_constant_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s9, 0xffff -; VI-NEXT: s_mov_b32 s8, s2 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_lshl_b64 s[4:5], s[8:9], s6 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_lshl_b64 s[0:1], s[8:9], s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: s_shl_constant_i64: @@ -1230,22 +1220,22 @@ define amdgpu_kernel void @v_shl_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { ; SI-LABEL: v_shl_constant_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s6, 0xab19b207 -; SI-NEXT: s_movk_i32 s7, 0x11e -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s2, 0xab19b207 +; SI-NEXT: s_movk_i32 s3, 0x11e +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshl_b64 v[0:1], s[6:7], v0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: v_lshl_b64 v[0:1], s[2:3], v0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_shl_constant_i64: @@ -1301,21 +1291,21 @@ define amdgpu_kernel void @v_shl_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { ; SI-LABEL: v_shl_i64_32_bit_constant: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; SI-NEXT: s_mov_b64 s[6:7], 0x12d687 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b64 s[2:3], 0x12d687 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshl_b64 v[0:1], s[6:7], v0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: v_lshl_b64 v[0:1], s[2:3], v0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_shl_i64_32_bit_constant: @@ -1369,36 +1359,34 @@ define amdgpu_kernel void @v_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { ; SI-LABEL: v_shl_inline_imm_64_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshl_b64 v[0:1], 64, v0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_shl_inline_imm_64_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_load_dword s0, s[2:3], 0x0 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_load_dword s4, s[2:3], 0x0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], 64, s0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: s_lshl_b64 s[4:5], 64, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_shl_inline_imm_64_i64: @@ -1434,28 +1422,28 @@ define amdgpu_kernel void @s_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_64_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[0:1], 64, s0 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_lshl_b64 s[4:5], 64, s4 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_shl_inline_imm_64_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], 64, s0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: s_lshl_b64 s[4:5], 64, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: s_shl_inline_imm_64_i64: @@ -1484,28 +1472,28 @@ define amdgpu_kernel void @s_shl_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_1_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[0:1], 1, s0 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_lshl_b64 s[4:5], 1, s4 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_shl_inline_imm_1_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], 1, s0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: s_lshl_b64 s[4:5], 1, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: s_shl_inline_imm_1_i64: @@ -1535,28 +1523,28 @@ define amdgpu_kernel void @s_shl_inline_imm_1_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_1_0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[0:1], 1.0, s0 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_lshl_b64 s[4:5], 1.0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_shl_inline_imm_1_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], 1.0, s0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: s_lshl_b64 s[4:5], 1.0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: s_shl_inline_imm_1_0_i64: @@ -1582,28 +1570,28 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_1_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_neg_1_0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[0:1], -1.0, s0 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_lshl_b64 s[4:5], -1.0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_shl_inline_imm_neg_1_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], -1.0, s0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: s_lshl_b64 s[4:5], -1.0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: s_shl_inline_imm_neg_1_0_i64: @@ -1629,28 +1617,28 @@ define amdgpu_kernel void @s_shl_inline_imm_0_5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_0_5_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[0:1], 0.5, s0 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_lshl_b64 s[4:5], 0.5, s4 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_shl_inline_imm_0_5_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], 0.5, s0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: s_lshl_b64 s[4:5], 0.5, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: s_shl_inline_imm_0_5_i64: @@ -1676,28 +1664,28 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_0_5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_neg_0_5_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[0:1], -0.5, s0 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_lshl_b64 s[4:5], -0.5, s4 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_shl_inline_imm_neg_0_5_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], -0.5, s0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: s_lshl_b64 s[4:5], -0.5, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: s_shl_inline_imm_neg_0_5_i64: @@ -1723,28 +1711,28 @@ define amdgpu_kernel void @s_shl_inline_imm_2_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_2_0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[0:1], 2.0, s0 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_lshl_b64 s[4:5], 2.0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_shl_inline_imm_2_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], 2.0, s0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: s_lshl_b64 s[4:5], 2.0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: s_shl_inline_imm_2_0_i64: @@ -1770,28 +1758,28 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_2_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_neg_2_0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[0:1], -2.0, s0 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_lshl_b64 s[4:5], -2.0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_shl_inline_imm_neg_2_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], -2.0, s0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: s_lshl_b64 s[4:5], -2.0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: s_shl_inline_imm_neg_2_0_i64: @@ -1817,28 +1805,28 @@ define amdgpu_kernel void @s_shl_inline_imm_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_4_0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[0:1], 4.0, s0 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_lshl_b64 s[4:5], 4.0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_shl_inline_imm_4_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], 4.0, s0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: s_lshl_b64 s[4:5], 4.0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: s_shl_inline_imm_4_0_i64: @@ -1864,28 +1852,28 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_neg_4_0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[0:1], -4.0, s0 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_lshl_b64 s[4:5], -4.0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_shl_inline_imm_neg_4_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], -4.0, s0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: s_lshl_b64 s[4:5], -4.0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: s_shl_inline_imm_neg_4_0_i64: diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -7,9 +7,9 @@ define amdgpu_kernel void @s_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { ; GFX9-LABEL: s_shl_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: s_load_dword s3, s[0:1], 0x30 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -41,21 +41,21 @@ ; ; CI-LABEL: s_shl_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; CI-NEXT: s_load_dword s2, s[0:1], 0xb -; CI-NEXT: s_load_dword s0, s[0:1], 0xc -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_load_dword s4, s[0:1], 0xb +; CI-NEXT: s_load_dword s5, s[0:1], 0xc +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s1, s2, 16 -; CI-NEXT: s_lshr_b32 s3, s0, 16 -; CI-NEXT: s_lshl_b32 s1, s1, s3 -; CI-NEXT: s_lshl_b32 s0, s2, s0 -; CI-NEXT: s_lshl_b32 s1, s1, 16 -; CI-NEXT: s_and_b32 s0, s0, 0xffff -; CI-NEXT: s_or_b32 s0, s0, s1 -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-NEXT: s_lshr_b32 s6, s4, 16 +; CI-NEXT: s_lshr_b32 s7, s5, 16 +; CI-NEXT: s_lshl_b32 s6, s6, s7 +; CI-NEXT: s_lshl_b32 s4, s4, s5 +; CI-NEXT: s_lshl_b32 s6, s6, 16 +; CI-NEXT: s_and_b32 s4, s4, 0xffff +; CI-NEXT: s_or_b32 s4, s4, s6 +; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; GFX10-LABEL: s_shl_v2i16: @@ -112,16 +112,16 @@ ; ; CI-LABEL: v_shl_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[0:1], s[6:7] -; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; CI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 -; CI-NEXT: s_mov_b64 s[6:7], s[2:3] +; CI-NEXT: s_mov_b64 s[4:5], s[2:3] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; CI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 +; CI-NEXT: s_mov_b64 s[2:3], s[6:7] ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -131,7 +131,7 @@ ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 -; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm ; ; GFX10-LABEL: v_shl_v2i16: @@ -350,15 +350,15 @@ ; ; CI-LABEL: shl_imm_v_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[0:1], s[6:7] -; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; CI-NEXT: s_mov_b64 s[6:7], s[2:3] +; CI-NEXT: s_mov_b64 s[4:5], s[2:3] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; CI-NEXT: s_mov_b64 s[2:3], s[6:7] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; CI-NEXT: v_lshl_b32_e32 v2, 8, v2 @@ -366,7 +366,7 @@ ; CI-NEXT: v_and_b32_e32 v2, 0xfff8, v2 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 -; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm ; ; GFX10-LABEL: shl_imm_v_v2i16: @@ -423,19 +423,19 @@ ; ; CI-LABEL: shl_v_imm_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[0:1], s[6:7] -; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; CI-NEXT: s_mov_b64 s[6:7], s[2:3] +; CI-NEXT: s_mov_b64 s[4:5], s[2:3] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; CI-NEXT: s_mov_b64 s[2:3], s[6:7] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; CI-NEXT: v_and_b32_e32 v2, 0xff00ff00, v2 -; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm ; ; GFX10-LABEL: shl_v_imm_v2i16: @@ -499,17 +499,17 @@ ; ; CI-LABEL: v_shl_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[0:1], s[6:7] -; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 -; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8 -; CI-NEXT: s_mov_b32 s0, 0xffff -; CI-NEXT: s_mov_b64 s[6:7], s[2:3] +; CI-NEXT: s_mov_b64 s[4:5], s[2:3] +; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 +; CI-NEXT: s_mov_b32 s4, 0xffff +; CI-NEXT: s_mov_b64 s[2:3], s[6:7] ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 @@ -520,13 +520,13 @@ ; CI-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; CI-NEXT: v_lshlrev_b32_e32 v4, v9, v7 ; CI-NEXT: v_lshlrev_b32_e32 v5, v8, v6 -; CI-NEXT: v_and_b32_e32 v3, s0, v3 -; CI-NEXT: v_and_b32_e32 v2, s0, v2 +; CI-NEXT: v_and_b32_e32 v3, s4, v3 +; CI-NEXT: v_and_b32_e32 v2, s4, v2 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; CI-NEXT: v_or_b32_e32 v3, v3, v4 ; CI-NEXT: v_or_b32_e32 v2, v2, v5 -; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm ; ; GFX10-LABEL: v_shl_v4i16: @@ -594,26 +594,26 @@ ; ; CI-LABEL: shl_v_imm_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[0:1], s[6:7] -; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 -; CI-NEXT: s_mov_b32 s0, 0xff00 -; CI-NEXT: s_mov_b64 s[6:7], s[2:3] +; CI-NEXT: s_mov_b64 s[4:5], s[2:3] +; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; CI-NEXT: s_mov_b32 s4, 0xff00 +; CI-NEXT: s_mov_b64 s[2:3], s[6:7] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v4, 8, v3 ; CI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; CI-NEXT: v_and_b32_e32 v4, s0, v4 +; CI-NEXT: v_and_b32_e32 v4, s4, v4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; CI-NEXT: v_and_b32_e32 v3, s0, v3 +; CI-NEXT: v_and_b32_e32 v3, s4, v3 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; CI-NEXT: v_or_b32_e32 v3, v3, v4 ; CI-NEXT: v_and_b32_e32 v2, 0xff00ff00, v2 -; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm ; ; GFX10-LABEL: shl_v_imm_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll b/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll --- a/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll +++ b/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll @@ -1,12 +1,12 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GCN %s ; GCN-LABEL: {{^}}shl_base_atomicrmw_global_ptr: -; GCN: v_add_co_u32_e32 v[[EXTRA_LO:[0-9]+]], vcc, 0x80, v4 -; GCN: v_addc_co_u32_e32 v[[EXTRA_HI:[0-9]+]], vcc, 0, v5, vcc -; GCN: v_lshlrev_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, 2, v[4:5] -; GCN: v_mov_b32_e32 [[THREE:v[0-9]+]], 3 -; GCN: global_atomic_and v{{\[}}[[LO]]:[[HI]]{{\]}}, [[THREE]], off offset:512 -; GCN: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[EXTRA_LO]]:[[EXTRA_HI]]{{\]}} +; GCN-DAG: v_add_co_u32_e32 v[[EXTRA_LO:[0-9]+]], vcc, 0x80, v4 +; GCN-DAG: v_addc_co_u32_e32 v[[EXTRA_HI:[0-9]+]], vcc, 0, v5, vcc +; GCN-DAG: v_lshlrev_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, 2, v[4:5] +; GCN-DAG: v_mov_b32_e32 [[THREE:v[0-9]+]], 3 +; GCN-DAG: global_atomic_and v{{\[}}[[LO]]:[[HI]]{{\]}}, [[THREE]], off offset:512 +; GCN-DAG: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[EXTRA_LO]]:[[EXTRA_HI]]{{\]}} define void @shl_base_atomicrmw_global_ptr(i32 addrspace(1)* %out, i64 addrspace(1)* %extra.use, [512 x i32] addrspace(1)* %ptr) #0 { %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(1)* %ptr, i64 0, i64 32 %cast = ptrtoint i32 addrspace(1)* %arrayidx0 to i64 @@ -18,12 +18,12 @@ } ; GCN-LABEL: {{^}}shl_base_global_ptr_global_atomic_fadd: -; GCN: v_add_co_u32_e32 v[[EXTRA_LO:[0-9]+]], vcc, 0x80, v4 -; GCN: v_addc_co_u32_e32 v[[EXTRA_HI:[0-9]+]], vcc, 0, v5, vcc -; GCN: v_lshlrev_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, 2, v[4:5] -; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x42c80000 -; GCN: global_atomic_add_f32 v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]], off offset:512 -; GCN: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[EXTRA_LO]]:[[EXTRA_HI]]{{\]}} +; GCN-DAG: v_add_co_u32_e32 v[[EXTRA_LO:[0-9]+]], vcc, 0x80, v4 +; GCN-DAG: v_addc_co_u32_e32 v[[EXTRA_HI:[0-9]+]], vcc, 0, v5, vcc +; GCN-DAG: v_lshlrev_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, 2, v[4:5] +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x42c80000 +; GCN-DAG: global_atomic_add_f32 v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]], off offset:512 +; GCN-DAG: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[EXTRA_LO]]:[[EXTRA_HI]]{{\]}} define void @shl_base_global_ptr_global_atomic_fadd(i32 addrspace(1)* %out, i64 addrspace(1)* %extra.use, [512 x i32] addrspace(1)* %ptr) #0 { %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(1)* %ptr, i64 0, i64 32 %cast = ptrtoint i32 addrspace(1)* %arrayidx0 to i64 diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll --- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -32,13 +32,13 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_subrev_u32_e32 v0, vcc, 64, v0 -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_subrev_u32_e32 v2, vcc, 64, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_i32_x_sub_64: @@ -189,13 +189,13 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_sub_u32_e32 v0, vcc, 64, v0 -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_sub_u32_e32 v2, vcc, 64, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_i32_64_sub_x: @@ -254,13 +254,13 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 0xffffffbf, v0 -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0xffffffbf, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_i32_x_sub_65: @@ -319,13 +319,13 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_sub_u32_e32 v0, vcc, 0x41, v0 -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_sub_u32_e32 v2, vcc, 0x41, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_i32_65_sub_x: @@ -384,13 +384,13 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_i32_x_sub_neg16: @@ -449,13 +449,13 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_sub_u32_e32 v0, vcc, -16, v0 -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_sub_u32_e32 v2, vcc, -16, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_i32_neg16_sub_x: @@ -514,13 +514,13 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 17, v0 -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 17, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_i32_x_sub_neg17: @@ -579,13 +579,13 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_sub_u32_e32 v0, vcc, 0xffffffef, v0 -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_sub_u32_e32 v2, vcc, 0xffffffef, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_i32_neg17_sub_x: @@ -689,13 +689,13 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_ushort v3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_subrev_u16_e32 v0, 64, v0 -; VI-NEXT: flat_store_short v[2:3], v0 +; VI-NEXT: v_subrev_u16_e32 v2, 64, v3 +; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_i16_x_sub_64: @@ -757,13 +757,13 @@ ; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v0 -; VI-NEXT: flat_load_ushort v0, v[1:2] -; VI-NEXT: v_mov_b32_e32 v4, s1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; VI-NEXT: flat_load_ushort v2, v[1:2] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_subrev_u16_e32 v0, 64, v0 -; VI-NEXT: flat_store_dword v[3:4], v0 +; VI-NEXT: v_subrev_u16_e32 v2, 64, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_i16_x_sub_64_zext_to_i32: @@ -1204,15 +1204,15 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, -16 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v2, -16 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_sub_0_16: @@ -1271,15 +1271,15 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, 0x3c00 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v2, 0x3c00 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_sub_0_1_0: @@ -1339,15 +1339,15 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, 0xffffbc00 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v2, 0xffffbc00 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_sub_0_neg1_0: @@ -1479,15 +1479,15 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, 32 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v2, 32 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_sub_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_sub_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_0_neg32: @@ -1687,15 +1687,15 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, -16 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v2, -16 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_0_neg16: @@ -2116,14 +2116,14 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, 32 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v2, 32 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_sub_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_undef_neg32: @@ -2183,13 +2183,13 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_subrev_u16_e32 v0, 32, v0 -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_subrev_u16_e32 v2, 32, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_neg32_undef: diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll --- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll @@ -5,50 +5,50 @@ define amdgpu_kernel void @break_inserted_outside_of_loop(i32 addrspace(1)* %out, i32 %a) { ; SI-LABEL: break_inserted_outside_of_loop: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, s0, v0 +; SI-NEXT: v_and_b32_e32 v0, s2, v0 ; SI-NEXT: v_and_b32_e32 v0, 1, v0 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; SI-NEXT: s_mov_b64 s[0:1], 0 +; SI-NEXT: s_mov_b64 s[2:3], 0 ; SI-NEXT: .LBB0_1: ; %ENDIF ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: s_and_b64 s[2:3], exec, vcc -; SI-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; SI-NEXT: s_and_b64 s[4:5], exec, vcc +; SI-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3] +; SI-NEXT: s_andn2_b64 exec, exec, s[2:3] ; SI-NEXT: s_cbranch_execnz .LBB0_1 ; SI-NEXT: ; %bb.2: ; %ENDLOOP -; SI-NEXT: s_or_b64 exec, exec, s[0:1] -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_or_b64 exec, exec, s[2:3] +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; FLAT-LABEL: break_inserted_outside_of_loop: ; FLAT: ; %bb.0: ; %main_body -; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; FLAT-NEXT: s_load_dword s0, s[0:1], 0x2c +; FLAT-NEXT: s_load_dword s2, s[0:1], 0x2c +; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; FLAT-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: v_and_b32_e32 v0, s0, v0 +; FLAT-NEXT: v_and_b32_e32 v0, s2, v0 ; FLAT-NEXT: v_and_b32_e32 v0, 1, v0 ; FLAT-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; FLAT-NEXT: s_mov_b64 s[0:1], 0 +; FLAT-NEXT: s_mov_b64 s[2:3], 0 ; FLAT-NEXT: .LBB0_1: ; %ENDIF ; FLAT-NEXT: ; =>This Inner Loop Header: Depth=1 -; FLAT-NEXT: s_and_b64 s[2:3], exec, vcc -; FLAT-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; FLAT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; FLAT-NEXT: s_and_b64 s[4:5], exec, vcc +; FLAT-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3] +; FLAT-NEXT: s_andn2_b64 exec, exec, s[2:3] ; FLAT-NEXT: s_cbranch_execnz .LBB0_1 ; FLAT-NEXT: ; %bb.2: ; %ENDLOOP -; FLAT-NEXT: s_or_b64 exec, exec, s[0:1] -; FLAT-NEXT: s_mov_b32 s7, 0xf000 -; FLAT-NEXT: s_mov_b32 s6, -1 +; FLAT-NEXT: s_or_b64 exec, exec, s[2:3] +; FLAT-NEXT: s_mov_b32 s3, 0xf000 +; FLAT-NEXT: s_mov_b32 s2, -1 ; FLAT-NEXT: v_mov_b32_e32 v0, 0 -; FLAT-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; FLAT-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; FLAT-NEXT: s_endpgm main_body: %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 diff --git a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll --- a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll +++ b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll @@ -160,9 +160,9 @@ } ; GCN-LABEL: {{^}}reorder_global_load_local_store_global_load: -; CI: ds_write_b32 ; CI: buffer_load_dword ; CI: buffer_load_dword +; CI: ds_write_b32 ; CI: buffer_store_dword ; GFX9: global_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4 @@ -291,8 +291,8 @@ } ; GCN-LABEL: {{^}}reorder_local_load_tbuffer_store_local_load: -; GCN: tbuffer_store_format ; GCN: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:1 offset1:2 +; GCN: tbuffer_store_format define amdgpu_vs void @reorder_local_load_tbuffer_store_local_load(i32 addrspace(1)* %out, i32 %a1, i32 %vaddr) #0 { %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll --- a/llvm/test/CodeGen/AMDGPU/sign_extend.ll +++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll @@ -5,28 +5,28 @@ define amdgpu_kernel void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_sext_i1_to_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_eq_u32 s0, s1 -; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_cmp_eq_u32 s4, s5 +; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_sext_i1_to_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_eq_u32 s0, s1 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_cmp_eq_u32 s4, s5 +; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %cmp = icmp eq i32 %a, %b %sext = sext i1 %cmp to i32 @@ -37,34 +37,34 @@ define amdgpu_kernel void @test_s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind { ; SI-LABEL: test_s_sext_i32_to_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; SI-NEXT: s_load_dword s0, s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dword s6, s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mul_i32 s1, s2, s3 -; SI-NEXT: s_add_i32 s1, s1, s0 -; SI-NEXT: s_ashr_i32 s0, s1, 31 -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_mul_i32 s4, s4, s5 +; SI-NEXT: s_add_i32 s4, s4, s6 +; SI-NEXT: s_ashr_i32 s5, s4, 31 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_s_sext_i32_to_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; VI-NEXT: s_load_dword s6, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mul_i32 s1, s2, s3 -; VI-NEXT: s_add_i32 s1, s1, s0 -; VI-NEXT: s_ashr_i32 s0, s1, 31 -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: s_mul_i32 s4, s4, s5 +; VI-NEXT: s_add_i32 s4, s4, s6 +; VI-NEXT: s_ashr_i32 s5, s4, 31 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm entry: %mul = mul i32 %a, %b @@ -77,30 +77,30 @@ define amdgpu_kernel void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_sext_i1_to_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_eq_u32 s0, s1 -; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; SI-NEXT: s_cmp_eq_u32 s4, s5 +; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] ; SI-NEXT: v_mov_b32_e32 v1, v0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_sext_i1_to_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_eq_u32 s0, s1 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; VI-NEXT: s_cmp_eq_u32 s4, s5 +; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] ; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %cmp = icmp eq i32 %a, %b %sext = sext i1 %cmp to i64 @@ -111,28 +111,28 @@ define amdgpu_kernel void @s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a) nounwind { ; SI-LABEL: s_sext_i32_to_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ashr_i32 s1, s0, 31 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_ashr_i32 s5, s4, 31 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_sext_i32_to_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ashr_i32 s1, s0, 31 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: s_ashr_i32 s5, s4, 31 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %sext = sext i32 %a to i64 store i64 %sext, i64 addrspace(1)* %out, align 8 @@ -142,38 +142,38 @@ define amdgpu_kernel void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { ; SI-LABEL: v_sext_i32_to_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_sext_i32_to_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm %val = load i32, i32 addrspace(1)* %in, align 4 %sext = sext i32 %val to i64 @@ -184,28 +184,28 @@ define amdgpu_kernel void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind { ; SI-LABEL: s_sext_i16_to_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_sext_i16_to_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %sext = sext i16 %a to i64 store i64 %sext, i64 addrspace(1)* %out, align 8 @@ -215,28 +215,28 @@ define amdgpu_kernel void @s_sext_i1_to_i16(i16 addrspace(1)* %out, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_sext_i1_to_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_eq_u32 s0, s1 -; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] -; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_cmp_eq_u32 s4, s5 +; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_sext_i1_to_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_eq_u32 s0, s1 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: s_cmp_eq_u32 s4, s5 +; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %cmp = icmp eq i32 %a, %b %sext = sext i1 %cmp to i16 @@ -251,34 +251,34 @@ define amdgpu_kernel void @s_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) nounwind { ; SI-LABEL: s_sext_i1_to_i16_with_and: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_eq_u32 s0, s1 -; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: s_cmp_eq_u32 s2, s3 -; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; SI-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] -; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_cmp_eq_u32 s4, s5 +; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; SI-NEXT: s_cmp_eq_u32 s6, s7 +; SI-NEXT: s_cselect_b64 s[6:7], -1, 0 +; SI-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_sext_i1_to_i16_with_and: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_eq_u32 s0, s1 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s2, s3 -; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; VI-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: s_cmp_eq_u32 s4, s5 +; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s6, s7 +; VI-NEXT: s_cselect_b64 s[6:7], -1, 0 +; VI-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %cmp0 = icmp eq i32 %a, %b %cmp1 = icmp eq i32 %c, %d @@ -291,34 +291,34 @@ define amdgpu_kernel void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind { ; SI-LABEL: v_sext_i1_to_i16_with_and: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; SI-NEXT: s_load_dword s0, s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dword s6, s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s2, v0 -; SI-NEXT: s_cmp_eq_u32 s3, s0 -; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] -; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 +; SI-NEXT: s_cmp_eq_u32 s5, s6 +; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; SI-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_sext_i1_to_i16_with_and: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; VI-NEXT: s_load_dword s6, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, s2, v0 -; VI-NEXT: s_cmp_eq_u32 s3, s0 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 +; VI-NEXT: s_cmp_eq_u32 s5, s6 +; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; VI-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %cmp0 = icmp eq i32 %a, %tid @@ -340,51 +340,51 @@ define amdgpu_kernel void @s_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 %a) nounwind { ; SI-LABEL: s_sext_v4i8_to_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ashr_i32 s1, s0, 24 -; SI-NEXT: s_bfe_i32 s2, s0, 0x80010 -; SI-NEXT: s_bfe_i32 s3, s0, 0x80008 -; SI-NEXT: s_sext_i32_i8 s0, s0 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_ashr_i32 s5, s4, 24 +; SI-NEXT: s_bfe_i32 s6, s4, 0x80010 +; SI-NEXT: s_bfe_i32 s7, s4, 0x80008 +; SI-NEXT: s_sext_i32_i8 s4, s4 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_sext_v4i8_to_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_lshrrev_b16_e64 v0, 8, s0 -; VI-NEXT: s_ashr_i32 s1, s0, 24 -; VI-NEXT: s_bfe_i32 s2, s0, 0x80010 -; VI-NEXT: s_sext_i32_i8 s0, s0 +; VI-NEXT: v_lshrrev_b16_e64 v0, 8, s4 +; VI-NEXT: s_ashr_i32 s5, s4, 24 +; VI-NEXT: s_bfe_i32 s6, s4, 0x80010 +; VI-NEXT: s_sext_i32_i8 s4, s4 ; VI-NEXT: v_bfe_i32 v0, v0, 0, 8 -; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm %cast = bitcast i32 %a to <4 x i8> @@ -405,58 +405,58 @@ define amdgpu_kernel void @v_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { ; SI-LABEL: v_sext_v4i8_to_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ashrrev_i32_e32 v1, 24, v0 ; SI-NEXT: v_bfe_i32 v2, v0, 16, 8 ; SI-NEXT: v_bfe_i32 v3, v0, 8, 8 ; SI-NEXT: v_bfe_i32 v0, v0, 0, 8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v3, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_sext_v4i8_to_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b16_e32 v1, 8, v0 ; VI-NEXT: v_ashrrev_i32_e32 v2, 24, v0 ; VI-NEXT: v_bfe_i32 v3, v0, 16, 8 ; VI-NEXT: v_bfe_i32 v0, v0, 0, 8 ; VI-NEXT: v_bfe_i32 v1, v1, 0, 8 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm %a = load i32, i32 addrspace(1)* %in @@ -477,53 +477,53 @@ define amdgpu_kernel void @s_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 %a) nounwind { ; SI-LABEL: s_sext_v4i16_to_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_ashr_i64 s[4:5], s[6:7], 48 -; SI-NEXT: s_ashr_i32 s5, s6, 16 -; SI-NEXT: s_sext_i32_i16 s6, s6 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: s_sext_i32_i16 s7, s7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_ashr_i64 s[0:1], s[2:3], 48 +; SI-NEXT: s_ashr_i32 s1, s2, 16 +; SI-NEXT: s_sext_i32_i16 s2, s2 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: s_sext_i32_i16 s3, s3 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_sext_v4i16_to_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_ashr_i32 s5, s6, 16 -; VI-NEXT: s_sext_i32_i16 s6, s6 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_ashr_i32 s4, s7, 16 -; VI-NEXT: s_sext_i32_i16 s7, s7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_ashr_i32 s1, s2, 16 +; VI-NEXT: s_sext_i32_i16 s2, s2 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: s_ashr_i32 s0, s3, 16 +; VI-NEXT: s_sext_i32_i16 s3, s3 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s5 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm %cast = bitcast i64 %a to <4 x i16> @@ -542,57 +542,57 @@ define amdgpu_kernel void @v_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { ; SI-LABEL: v_sext_v4i16_to_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ashr_i64 v[2:3], v[0:1], 48 ; SI-NEXT: v_ashrrev_i32_e32 v3, 16, v0 ; SI-NEXT: v_bfe_i32 v0, v0, 0, 16 ; SI-NEXT: v_bfe_i32 v1, v1, 0, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v3, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_sext_v4i16_to_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 ; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ashrrev_i32_e32 v3, 16, v0 ; VI-NEXT: v_bfe_i32 v0, v0, 0, 16 ; VI-NEXT: v_ashrrev_i32_e32 v2, 16, v1 ; VI-NEXT: v_bfe_i32 v1, v1, 0, 16 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm %a = load i64, i64 addrspace(1)* %in diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll --- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll @@ -275,35 +275,35 @@ ; ; GFX8-LABEL: s_sint_to_fp_v2i64_to_v2f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_xor_b32 s7, s2, s3 -; GFX8-NEXT: s_flbit_i32 s6, s3 -; GFX8-NEXT: s_ashr_i32 s7, s7, 31 -; GFX8-NEXT: s_add_i32 s6, s6, -1 -; GFX8-NEXT: s_add_i32 s7, s7, 32 -; GFX8-NEXT: s_min_u32 s6, s6, s7 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s6 +; GFX8-NEXT: s_xor_b32 s3, s6, s7 +; GFX8-NEXT: s_flbit_i32 s2, s7 +; GFX8-NEXT: s_ashr_i32 s3, s3, 31 +; GFX8-NEXT: s_add_i32 s2, s2, -1 +; GFX8-NEXT: s_add_i32 s3, s3, 32 +; GFX8-NEXT: s_min_u32 s9, s2, s3 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s9 ; GFX8-NEXT: s_min_u32 s2, s2, 1 ; GFX8-NEXT: s_or_b32 s2, s3, s2 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX8-NEXT: s_xor_b32 s2, s0, s1 -; GFX8-NEXT: s_flbit_i32 s8, s1 +; GFX8-NEXT: s_xor_b32 s2, s4, s5 +; GFX8-NEXT: s_flbit_i32 s8, s5 ; GFX8-NEXT: s_ashr_i32 s2, s2, 31 ; GFX8-NEXT: s_add_i32 s8, s8, -1 ; GFX8-NEXT: s_add_i32 s2, s2, 32 -; GFX8-NEXT: s_min_u32 s2, s8, s2 -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 -; GFX8-NEXT: s_min_u32 s0, s0, 1 -; GFX8-NEXT: s_or_b32 s0, s1, s0 -; GFX8-NEXT: v_cvt_f32_i32_e32 v2, s0 -; GFX8-NEXT: s_sub_i32 s0, 32, s6 -; GFX8-NEXT: v_ldexp_f32 v1, v0, s0 -; GFX8-NEXT: s_sub_i32 s0, 32, s2 -; GFX8-NEXT: v_ldexp_f32 v0, v2, s0 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: s_min_u32 s6, s8, s2 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], s6 +; GFX8-NEXT: s_min_u32 s2, s2, 1 +; GFX8-NEXT: s_or_b32 s2, s3, s2 +; GFX8-NEXT: v_cvt_f32_i32_e32 v2, s2 +; GFX8-NEXT: s_sub_i32 s2, 32, s9 +; GFX8-NEXT: v_ldexp_f32 v1, v0, s2 +; GFX8-NEXT: s_sub_i32 s2, 32, s6 +; GFX8-NEXT: v_ldexp_f32 v0, v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm %result = sitofp <2 x i64> %in to <2 x float> @@ -388,24 +388,24 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s2, v1 ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 16, v5 -; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v6, vcc -; GFX8-NEXT: flat_load_dwordx4 v[1:4], v[1:2] +; GFX8-NEXT: flat_load_dwordx4 v[1:4], v[5:6] +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 16, v5 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc ; GFX8-NEXT: flat_load_dwordx4 v[5:8], v[5:6] ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v0 ; GFX8-NEXT: v_mov_b32_e32 v10, s1 ; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_xor_b32_e32 v14, v3, v4 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_xor_b32_e32 v0, v7, v8 -; GFX8-NEXT: v_xor_b32_e32 v12, v5, v6 -; GFX8-NEXT: v_xor_b32_e32 v16, v1, v2 -; GFX8-NEXT: v_ffbh_i32_e32 v11, v8 -; GFX8-NEXT: v_ffbh_i32_e32 v13, v6 -; GFX8-NEXT: v_ffbh_i32_e32 v15, v4 -; GFX8-NEXT: v_ffbh_i32_e32 v17, v2 +; GFX8-NEXT: v_xor_b32_e32 v0, v3, v4 +; GFX8-NEXT: v_xor_b32_e32 v12, v1, v2 +; GFX8-NEXT: v_ffbh_i32_e32 v11, v4 +; GFX8-NEXT: v_ffbh_i32_e32 v13, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_xor_b32_e32 v14, v7, v8 +; GFX8-NEXT: v_xor_b32_e32 v16, v5, v6 +; GFX8-NEXT: v_ffbh_i32_e32 v15, v8 +; GFX8-NEXT: v_ffbh_i32_e32 v17, v6 ; GFX8-NEXT: v_ashrrev_i32_e32 v12, 31, v12 ; GFX8-NEXT: v_ashrrev_i32_e32 v14, 31, v14 ; GFX8-NEXT: v_ashrrev_i32_e32 v16, 31, v16 @@ -421,30 +421,30 @@ ; GFX8-NEXT: v_min_u32_e32 v11, v13, v12 ; GFX8-NEXT: v_min_u32_e32 v12, v15, v14 ; GFX8-NEXT: v_min_u32_e32 v13, v17, v16 -; GFX8-NEXT: v_lshlrev_b64 v[7:8], v0, v[7:8] +; GFX8-NEXT: v_lshlrev_b64 v[3:4], v0, v[3:4] ; GFX8-NEXT: v_sub_u32_e32 v14, vcc, 32, v0 -; GFX8-NEXT: v_lshlrev_b64 v[5:6], v11, v[5:6] -; GFX8-NEXT: v_lshlrev_b64 v[3:4], v12, v[3:4] -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v13, v[1:2] -; GFX8-NEXT: v_min_u32_e32 v7, 1, v7 -; GFX8-NEXT: v_min_u32_e32 v5, 1, v5 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v11, v[1:2] +; GFX8-NEXT: v_lshlrev_b64 v[7:8], v12, v[7:8] +; GFX8-NEXT: v_lshlrev_b64 v[5:6], v13, v[5:6] ; GFX8-NEXT: v_min_u32_e32 v3, 1, v3 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX8-NEXT: v_or_b32_e32 v7, v8, v7 -; GFX8-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX8-NEXT: v_min_u32_e32 v7, 1, v7 +; GFX8-NEXT: v_min_u32_e32 v5, 1, v5 ; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v7 -; GFX8-NEXT: v_cvt_f32_i32_e32 v4, v5 +; GFX8-NEXT: v_or_b32_e32 v1, v8, v7 +; GFX8-NEXT: v_or_b32_e32 v4, v6, v5 ; GFX8-NEXT: v_cvt_f32_i32_e32 v3, v3 -; GFX8-NEXT: v_cvt_f32_i32_e32 v5, v0 -; GFX8-NEXT: v_sub_u32_e32 v11, vcc, 32, v11 -; GFX8-NEXT: v_sub_u32_e32 v12, vcc, 32, v12 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v13 -; GFX8-NEXT: v_ldexp_f32 v1, v1, v14 -; GFX8-NEXT: v_ldexp_f32 v0, v4, v11 -; GFX8-NEXT: v_ldexp_f32 v3, v3, v12 -; GFX8-NEXT: v_ldexp_f32 v2, v5, v2 +; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX8-NEXT: v_cvt_f32_i32_e32 v5, v1 +; GFX8-NEXT: v_cvt_f32_i32_e32 v4, v4 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v11 +; GFX8-NEXT: v_sub_u32_e32 v11, vcc, 32, v12 +; GFX8-NEXT: v_sub_u32_e32 v12, vcc, 32, v13 +; GFX8-NEXT: v_ldexp_f32 v1, v3, v14 +; GFX8-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX8-NEXT: v_ldexp_f32 v3, v5, v11 +; GFX8-NEXT: v_ldexp_f32 v2, v4, v12 ; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[0:3] ; GFX8-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -505,22 +505,22 @@ ; GFX8-NEXT: s_ashr_i32 s3, s3, 31 ; GFX8-NEXT: s_add_i32 s2, s2, -1 ; GFX8-NEXT: s_add_i32 s3, s3, 32 -; GFX8-NEXT: s_min_u32 s9, s2, s3 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s9 +; GFX8-NEXT: s_min_u32 s8, s2, s3 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s8 ; GFX8-NEXT: s_min_u32 s2, s2, 1 ; GFX8-NEXT: s_or_b32 s2, s3, s2 +; GFX8-NEXT: s_xor_b32 s3, s4, s5 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX8-NEXT: s_xor_b32 s2, s4, s5 -; GFX8-NEXT: s_flbit_i32 s8, s5 -; GFX8-NEXT: s_ashr_i32 s2, s2, 31 -; GFX8-NEXT: s_add_i32 s8, s8, -1 -; GFX8-NEXT: s_add_i32 s2, s2, 32 -; GFX8-NEXT: s_min_u32 s7, s8, s2 +; GFX8-NEXT: s_flbit_i32 s2, s5 +; GFX8-NEXT: s_ashr_i32 s3, s3, 31 +; GFX8-NEXT: s_add_i32 s2, s2, -1 +; GFX8-NEXT: s_add_i32 s3, s3, 32 +; GFX8-NEXT: s_min_u32 s7, s2, s3 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], s7 ; GFX8-NEXT: s_min_u32 s2, s2, 1 ; GFX8-NEXT: s_or_b32 s2, s3, s2 ; GFX8-NEXT: v_cvt_f32_i32_e32 v1, s2 -; GFX8-NEXT: s_sub_i32 s6, 32, s9 +; GFX8-NEXT: s_sub_i32 s6, 32, s8 ; GFX8-NEXT: s_sub_i32 s2, 32, s7 ; GFX8-NEXT: v_ldexp_f32 v0, v0, s6 ; GFX8-NEXT: v_ldexp_f32 v1, v1, s2 @@ -621,22 +621,22 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s2, v1 ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 16, v5 -; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v6, vcc -; GFX8-NEXT: flat_load_dwordx4 v[1:4], v[1:2] +; GFX8-NEXT: flat_load_dwordx4 v[1:4], v[5:6] +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 16, v5 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc ; GFX8-NEXT: flat_load_dwordx4 v[5:8], v[5:6] ; GFX8-NEXT: v_mov_b32_e32 v10, s1 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_xor_b32_e32 v14, v3, v4 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_xor_b32_e32 v0, v7, v8 -; GFX8-NEXT: v_xor_b32_e32 v12, v5, v6 -; GFX8-NEXT: v_xor_b32_e32 v16, v1, v2 -; GFX8-NEXT: v_ffbh_i32_e32 v11, v8 -; GFX8-NEXT: v_ffbh_i32_e32 v13, v6 -; GFX8-NEXT: v_ffbh_i32_e32 v15, v4 -; GFX8-NEXT: v_ffbh_i32_e32 v17, v2 +; GFX8-NEXT: v_xor_b32_e32 v0, v3, v4 +; GFX8-NEXT: v_xor_b32_e32 v12, v1, v2 +; GFX8-NEXT: v_ffbh_i32_e32 v11, v4 +; GFX8-NEXT: v_ffbh_i32_e32 v13, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_xor_b32_e32 v14, v7, v8 +; GFX8-NEXT: v_xor_b32_e32 v16, v5, v6 +; GFX8-NEXT: v_ffbh_i32_e32 v15, v8 +; GFX8-NEXT: v_ffbh_i32_e32 v17, v6 ; GFX8-NEXT: v_ashrrev_i32_e32 v12, 31, v12 ; GFX8-NEXT: v_ashrrev_i32_e32 v14, 31, v14 ; GFX8-NEXT: v_ashrrev_i32_e32 v16, 31, v16 @@ -652,38 +652,38 @@ ; GFX8-NEXT: v_min_u32_e32 v11, v13, v12 ; GFX8-NEXT: v_min_u32_e32 v12, v15, v14 ; GFX8-NEXT: v_min_u32_e32 v13, v17, v16 -; GFX8-NEXT: v_lshlrev_b64 v[7:8], v0, v[7:8] +; GFX8-NEXT: v_lshlrev_b64 v[3:4], v0, v[3:4] ; GFX8-NEXT: v_sub_u32_e32 v14, vcc, 32, v0 -; GFX8-NEXT: v_lshlrev_b64 v[5:6], v11, v[5:6] -; GFX8-NEXT: v_lshlrev_b64 v[3:4], v12, v[3:4] -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v13, v[1:2] -; GFX8-NEXT: v_min_u32_e32 v7, 1, v7 -; GFX8-NEXT: v_min_u32_e32 v5, 1, v5 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v11, v[1:2] +; GFX8-NEXT: v_lshlrev_b64 v[7:8], v12, v[7:8] +; GFX8-NEXT: v_lshlrev_b64 v[5:6], v13, v[5:6] ; GFX8-NEXT: v_min_u32_e32 v3, 1, v3 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX8-NEXT: v_or_b32_e32 v7, v8, v7 -; GFX8-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX8-NEXT: v_min_u32_e32 v7, 1, v7 +; GFX8-NEXT: v_min_u32_e32 v5, 1, v5 ; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v7 -; GFX8-NEXT: v_cvt_f32_i32_e32 v4, v5 +; GFX8-NEXT: v_or_b32_e32 v1, v8, v7 +; GFX8-NEXT: v_or_b32_e32 v4, v6, v5 ; GFX8-NEXT: v_cvt_f32_i32_e32 v3, v3 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX8-NEXT: v_sub_u32_e32 v11, vcc, 32, v11 -; GFX8-NEXT: v_sub_u32_e32 v12, vcc, 32, v12 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v13 -; GFX8-NEXT: v_ldexp_f32 v1, v1, v14 -; GFX8-NEXT: v_ldexp_f32 v4, v4, v11 -; GFX8-NEXT: v_ldexp_f32 v3, v3, v12 +; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX8-NEXT: v_cvt_f32_i32_e32 v4, v4 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v11 +; GFX8-NEXT: v_sub_u32_e32 v11, vcc, 32, v12 +; GFX8-NEXT: v_sub_u32_e32 v12, vcc, 32, v13 +; GFX8-NEXT: v_ldexp_f32 v3, v3, v14 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX8-NEXT: v_cvt_f16_f32_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; GFX8-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX8-NEXT: v_ldexp_f32 v1, v1, v11 +; GFX8-NEXT: v_ldexp_f32 v2, v4, v12 ; GFX8-NEXT: v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; GFX8-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX8-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GFX8-NEXT: v_cvt_f16_f32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GFX8-NEXT: v_cvt_f16_f32_e32 v6, v2 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v9 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v10, vcc -; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v2, v4, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v6, v5 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll --- a/llvm/test/CodeGen/AMDGPU/sra.ll +++ b/llvm/test/CodeGen/AMDGPU/sra.ll @@ -8,40 +8,40 @@ define amdgpu_kernel void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { ; SI-LABEL: ashr_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ashr_i32_e32 v1, v1, v3 ; SI-NEXT: v_ashr_i32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: ashr_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ashrrev_i32_e32 v1, v3, v1 ; VI-NEXT: v_ashrrev_i32_e32 v0, v2, v0 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: ashr_v2i32: @@ -73,46 +73,46 @@ define amdgpu_kernel void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { ; SI-LABEL: ashr_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ashr_i32_e32 v3, v3, v7 ; SI-NEXT: v_ashr_i32_e32 v2, v2, v6 ; SI-NEXT: v_ashr_i32_e32 v1, v1, v5 ; SI-NEXT: v_ashr_i32_e32 v0, v0, v4 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: ashr_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ashrrev_i32_e32 v3, v7, v3 ; VI-NEXT: v_ashrrev_i32_e32 v2, v6, v2 ; VI-NEXT: v_ashrrev_i32_e32 v1, v5, v1 ; VI-NEXT: v_ashrrev_i32_e32 v0, v4, v0 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: ashr_v4i32: @@ -148,17 +148,17 @@ define amdgpu_kernel void @ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { ; SI-LABEL: ashr_v2i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bfe_i32 v2, v0, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v0, 16, v0 @@ -168,27 +168,27 @@ ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: ashr_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 ; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ashrrev_i32_sdwa v2, sext(v1), sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; VI-NEXT: v_ashrrev_i32_sdwa v0, sext(v1), sext(v0) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: ashr_v2i16: @@ -234,18 +234,18 @@ define amdgpu_kernel void @ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { ; SI-LABEL: ashr_v4i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s6, 0xffff -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s2, 0xffff +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bfe_i32 v4, v0, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v0, 16, v0 @@ -258,27 +258,27 @@ ; SI-NEXT: v_ashrrev_i32_e32 v0, v6, v0 ; SI-NEXT: v_ashrrev_i32_e32 v2, v2, v4 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v3, s6, v3 +; SI-NEXT: v_and_b32_e32 v3, s2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v2, s6, v2 +; SI-NEXT: v_and_b32_e32 v2, s2, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: ashr_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ashrrev_i32_sdwa v4, sext(v2), sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; VI-NEXT: v_ashrrev_i32_sdwa v0, sext(v2), sext(v0) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -286,7 +286,7 @@ ; VI-NEXT: v_ashrrev_i32_sdwa v1, sext(v3), sext(v1) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: ashr_v4i16: @@ -377,30 +377,30 @@ define amdgpu_kernel void @s_ashr_i64(i64 addrspace(1)* %out, i32 %in) { ; SI-LABEL: s_ashr_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ashr_i32 s1, s0, 31 -; SI-NEXT: s_ashr_i64 s[0:1], s[0:1], 8 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_ashr_i32 s5, s4, 31 +; SI-NEXT: s_ashr_i64 s[4:5], s[4:5], 8 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_ashr_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ashr_i32 s1, s0, 31 -; VI-NEXT: s_ashr_i64 s[0:1], s[0:1], 8 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: s_ashr_i32 s5, s4, 31 +; VI-NEXT: s_ashr_i64 s[4:5], s[4:5], 8 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: s_ashr_i64: @@ -425,38 +425,38 @@ define amdgpu_kernel void @ashr_i64_2(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { ; SI-LABEL: ashr_i64_2: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], v2 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: ashr_i64_2: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ashrrev_i64 v[0:1], v2, v[0:1] -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: ashr_i64_2: @@ -495,42 +495,42 @@ define amdgpu_kernel void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { ; SI-LABEL: ashr_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ashr_i64 v[2:3], v[2:3], v6 ; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], v4 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: ashr_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ashrrev_i64 v[2:3], v6, v[2:3] ; VI-NEXT: v_ashrrev_i64 v[0:1], v4, v[0:1] -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: ashr_v2i64: @@ -587,20 +587,20 @@ ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s8, s6 ; SI-NEXT: s_mov_b32 s9, s7 -; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 -; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:48 +; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 +; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48 +; SI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0 +; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32 ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_ashr_i64 v[2:3], v[2:3], v10 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_ashr_i64 v[2:3], v[2:3], v6 +; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_ashr_i64 v[6:7], v[6:7], v13 -; SI-NEXT: v_ashr_i64 v[4:5], v[4:5], v11 -; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], v8 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: v_ashr_i64 v[9:10], v[9:10], v13 +; SI-NEXT: v_ashr_i64 v[7:8], v[7:8], v11 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: ashr_v4i64: @@ -613,20 +613,20 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s8, s6 ; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 -; VI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:48 +; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 +; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48 +; VI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0 +; VI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32 ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_ashrrev_i64 v[2:3], v10, v[2:3] +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_ashrrev_i64 v[2:3], v6, v[2:3] +; VI-NEXT: v_ashrrev_i64 v[0:1], v4, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_ashrrev_i64 v[6:7], v13, v[6:7] -; VI-NEXT: v_ashrrev_i64 v[4:5], v11, v[4:5] -; VI-NEXT: v_ashrrev_i64 v[0:1], v8, v[0:1] -; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: v_ashrrev_i64 v[9:10], v13, v[9:10] +; VI-NEXT: v_ashrrev_i64 v[7:8], v11, v[7:8] +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; VI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: ashr_v4i64: @@ -696,34 +696,34 @@ define amdgpu_kernel void @s_ashr_32_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a, [8 x i32], i64 %b) { ; SI-LABEL: s_ashr_32_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s2, s[0:1], 0x14 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x1d -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s6, s[0:1], 0x14 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x1d +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ashr_i32 s3, s2, 31 -; SI-NEXT: s_add_u32 s0, s2, s0 -; SI-NEXT: s_addc_u32 s1, s3, s1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_ashr_i32 s7, s6, 31 +; SI-NEXT: s_add_u32 s4, s6, s4 +; SI-NEXT: s_addc_u32 s5, s7, s5 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_ashr_32_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s2, s[0:1], 0x50 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x74 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s6, s[0:1], 0x50 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ashr_i32 s3, s2, 31 -; VI-NEXT: s_add_u32 s0, s2, s0 -; VI-NEXT: s_addc_u32 s1, s3, s1 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: s_ashr_i32 s7, s6, 31 +; VI-NEXT: s_add_u32 s4, s6, s4 +; VI-NEXT: s_addc_u32 s5, s7, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: s_ashr_32_i64: @@ -756,10 +756,10 @@ ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[10:11], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4 ; SI-NEXT: s_mov_b64 s[4:5], s[0:1] -; SI-NEXT: s_mov_b64 s[0:1], s[2:3] -; SI-NEXT: s_mov_b64 s[2:3], s[6:7] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 offset:4 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 @@ -768,20 +768,20 @@ ; VI-LABEL: v_ashr_32_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v0 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v0, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ashr_32_i64: @@ -815,34 +815,34 @@ define amdgpu_kernel void @s_ashr_63_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a, [8 x i32], i64 %b) { ; SI-LABEL: s_ashr_63_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s2, s[0:1], 0x14 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x1d -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s6, s[0:1], 0x14 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x1d +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ashr_i32 s2, s2, 31 -; SI-NEXT: s_add_u32 s0, s2, s0 -; SI-NEXT: s_addc_u32 s1, s2, s1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_ashr_i32 s6, s6, 31 +; SI-NEXT: s_add_u32 s4, s6, s4 +; SI-NEXT: s_addc_u32 s5, s6, s5 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_ashr_63_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s2, s[0:1], 0x50 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x74 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s6, s[0:1], 0x50 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ashr_i32 s2, s2, 31 -; VI-NEXT: s_add_u32 s0, s2, s0 -; VI-NEXT: s_addc_u32 s1, s2, s1 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: s_ashr_i32 s6, s6, 31 +; VI-NEXT: s_add_u32 s4, s6, s4 +; VI-NEXT: s_addc_u32 s5, s6, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: s_ashr_63_i64: @@ -875,10 +875,10 @@ ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[10:11], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4 ; SI-NEXT: s_mov_b64 s[4:5], s[0:1] -; SI-NEXT: s_mov_b64 s[0:1], s[2:3] -; SI-NEXT: s_mov_b64 s[2:3], s[6:7] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 offset:4 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v2 ; SI-NEXT: v_mov_b32_e32 v3, v2 @@ -888,19 +888,19 @@ ; VI-LABEL: v_ashr_63_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v0 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v0, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v2, v[2:3] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_ashrrev_i32_e32 v2, 31, v2 +; VI-NEXT: v_ashrrev_i32_e32 v2, 31, v3 ; VI-NEXT: v_mov_b32_e32 v3, v2 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -123,19 +123,19 @@ ; ; GCN-IR-LABEL: s_test_srem: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GCN-IR-NEXT: s_mov_b64 s[2:3], 0 +; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[6:7], 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[0:1], 0 -; GCN-IR-NEXT: s_flbit_i32_b32 s12, s0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[4:5], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[2:3], 0 +; GCN-IR-NEXT: s_flbit_i32_b32 s12, s4 ; GCN-IR-NEXT: s_or_b64 s[14:15], s[8:9], s[10:11] -; GCN-IR-NEXT: s_flbit_i32_b32 s10, s6 +; GCN-IR-NEXT: s_flbit_i32_b32 s10, s2 ; GCN-IR-NEXT: s_add_i32 s12, s12, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s8, s1 +; GCN-IR-NEXT: s_flbit_i32_b32 s8, s5 ; GCN-IR-NEXT: s_add_i32 s10, s10, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s11, s7 +; GCN-IR-NEXT: s_flbit_i32_b32 s11, s3 ; GCN-IR-NEXT: s_min_u32 s8, s12, s8 ; GCN-IR-NEXT: s_min_u32 s12, s10, s11 ; GCN-IR-NEXT: s_sub_u32 s10, s8, s12 @@ -156,31 +156,31 @@ ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[14:15], v[0:1] ; GCN-IR-NEXT: s_sub_i32 s10, 63, s10 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, vcc -; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[6:7], s10 +; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[2:3], s10 ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[6:7], s14 -; GCN-IR-NEXT: s_add_u32 s16, s0, -1 -; GCN-IR-NEXT: s_addc_u32 s17, s1, -1 -; GCN-IR-NEXT: s_not_b64 s[2:3], s[8:9] +; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[2:3], s14 +; GCN-IR-NEXT: s_add_u32 s16, s4, -1 +; GCN-IR-NEXT: s_addc_u32 s17, s5, -1 +; GCN-IR-NEXT: s_not_b64 s[6:7], s[8:9] ; GCN-IR-NEXT: s_mov_b32 s13, s9 -; GCN-IR-NEXT: s_add_u32 s8, s2, s12 -; GCN-IR-NEXT: s_addc_u32 s9, s3, s9 +; GCN-IR-NEXT: s_add_u32 s8, s6, s12 +; GCN-IR-NEXT: s_addc_u32 s9, s7, s9 ; GCN-IR-NEXT: s_mov_b64 s[12:13], 0 -; GCN-IR-NEXT: s_mov_b32 s3, 0 +; GCN-IR-NEXT: s_mov_b32 s7, 0 ; GCN-IR-NEXT: .LBB0_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: s_lshl_b64 s[14:15], s[14:15], 1 -; GCN-IR-NEXT: s_lshr_b32 s2, s11, 31 +; GCN-IR-NEXT: s_lshr_b32 s6, s11, 31 ; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 -; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[2:3] +; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[6:7] ; GCN-IR-NEXT: s_or_b64 s[10:11], s[12:13], s[10:11] -; GCN-IR-NEXT: s_sub_u32 s2, s16, s14 -; GCN-IR-NEXT: s_subb_u32 s2, s17, s15 -; GCN-IR-NEXT: s_ashr_i32 s12, s2, 31 +; GCN-IR-NEXT: s_sub_u32 s6, s16, s14 +; GCN-IR-NEXT: s_subb_u32 s6, s17, s15 +; GCN-IR-NEXT: s_ashr_i32 s12, s6, 31 ; GCN-IR-NEXT: s_mov_b32 s13, s12 -; GCN-IR-NEXT: s_and_b32 s2, s12, 1 -; GCN-IR-NEXT: s_and_b64 s[18:19], s[12:13], s[0:1] +; GCN-IR-NEXT: s_and_b32 s6, s12, 1 +; GCN-IR-NEXT: s_and_b64 s[18:19], s[12:13], s[4:5] ; GCN-IR-NEXT: s_sub_u32 s14, s14, s18 ; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 ; GCN-IR-NEXT: s_subb_u32 s15, s15, s19 @@ -188,33 +188,33 @@ ; GCN-IR-NEXT: s_add_u32 s8, s8, 1 ; GCN-IR-NEXT: s_addc_u32 s9, s9, 0 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] -; GCN-IR-NEXT: s_mov_b64 s[12:13], s[2:3] +; GCN-IR-NEXT: s_mov_b64 s[12:13], s[6:7] ; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3 ; GCN-IR-NEXT: .LBB0_4: ; %Flow6 ; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[10:11], 1 -; GCN-IR-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GCN-IR-NEXT: v_mov_b32_e32 v0, s2 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s3 +; GCN-IR-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s7 ; GCN-IR-NEXT: s_branch .LBB0_6 ; GCN-IR-NEXT: .LBB0_5: -; GCN-IR-NEXT: v_mov_b32_e32 v0, s7 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s3 ; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[14:15] -; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s2 ; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[14:15] ; GCN-IR-NEXT: .LBB0_6: ; %udiv-end -; GCN-IR-NEXT: v_mul_lo_u32 v1, s0, v1 -; GCN-IR-NEXT: v_mul_hi_u32 v2, s0, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v3, s1, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v0, s0, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v1, s4, v1 +; GCN-IR-NEXT: v_mul_hi_u32 v2, s4, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v3, s5, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v0, s4, v0 ; GCN-IR-NEXT: s_mov_b32 s11, 0xf000 ; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s7 -; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s3 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 ; GCN-IR-NEXT: s_mov_b32 s10, -1 -; GCN-IR-NEXT: s_mov_b32 s8, s4 -; GCN-IR-NEXT: s_mov_b32 s9, s5 +; GCN-IR-NEXT: s_mov_b32 s8, s0 +; GCN-IR-NEXT: s_mov_b32 s9, s1 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GCN-IR-NEXT: s_endpgm @@ -463,66 +463,66 @@ define amdgpu_kernel void @s_test_srem23_64(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem23_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[6:7], s[6:7], 41 -; GCN-NEXT: s_ashr_i64 s[0:1], s[0:1], 41 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s6 -; GCN-NEXT: s_xor_b32 s1, s6, s0 -; GCN-NEXT: s_ashr_i32 s1, s1, 30 +; GCN-NEXT: s_ashr_i64 s[4:5], s[4:5], 41 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 41 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s2 +; GCN-NEXT: s_xor_b32 s3, s2, s4 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_or_b32 s1, s1, 1 -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_ashr_i32 s3, s3, 30 +; GCN-NEXT: s_or_b32 s3, s3, 1 +; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s0 -; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 +; GCN-NEXT: v_mul_lo_u32 v0, v0, s4 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 23 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_srem23_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[6:7], 41 -; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[0:1], 41 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s6 -; GCN-IR-NEXT: s_xor_b32 s1, s6, s0 -; GCN-IR-NEXT: s_ashr_i32 s1, s1, 30 +; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[4:5], 41 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 41 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s2 +; GCN-IR-NEXT: s_xor_b32 s3, s2, s4 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_or_b32 s1, s1, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v3, s1 -; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_ashr_i32 s3, s3, 30 +; GCN-IR-NEXT: s_or_b32 s3, s3, 1 +; GCN-IR-NEXT: v_mov_b32_e32 v3, s3 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s0 -; GCN-IR-NEXT: s_mov_b32 s0, s4 -; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s4 +; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 23 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm %1 = ashr i64 %x, 41 %2 = ashr i64 %y, 41 @@ -534,66 +534,66 @@ define amdgpu_kernel void @s_test_srem24_64(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem24_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[6:7], s[6:7], 40 -; GCN-NEXT: s_ashr_i64 s[0:1], s[0:1], 40 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s6 -; GCN-NEXT: s_xor_b32 s1, s6, s0 -; GCN-NEXT: s_ashr_i32 s1, s1, 30 +; GCN-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 40 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s2 +; GCN-NEXT: s_xor_b32 s3, s2, s4 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_or_b32 s1, s1, 1 -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_ashr_i32 s3, s3, 30 +; GCN-NEXT: s_or_b32 s3, s3, 1 +; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s0 -; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 +; GCN-NEXT: v_mul_lo_u32 v0, v0, s4 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_srem24_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[6:7], 40 -; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[0:1], 40 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s6 -; GCN-IR-NEXT: s_xor_b32 s1, s6, s0 -; GCN-IR-NEXT: s_ashr_i32 s1, s1, 30 +; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 40 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s2 +; GCN-IR-NEXT: s_xor_b32 s3, s2, s4 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_or_b32 s1, s1, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v3, s1 -; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_ashr_i32 s3, s3, 30 +; GCN-IR-NEXT: s_or_b32 s3, s3, 1 +; GCN-IR-NEXT: v_mov_b32_e32 v3, s3 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s0 -; GCN-IR-NEXT: s_mov_b32 s0, s4 -; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s4 +; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm %1 = ashr i64 %x, 40 %2 = ashr i64 %y, 40 @@ -659,66 +659,66 @@ define amdgpu_kernel void @s_test_srem25_64(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem25_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[6:7], s[6:7], 39 -; GCN-NEXT: s_ashr_i64 s[0:1], s[0:1], 39 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s6 -; GCN-NEXT: s_xor_b32 s1, s6, s0 -; GCN-NEXT: s_ashr_i32 s1, s1, 30 +; GCN-NEXT: s_ashr_i64 s[4:5], s[4:5], 39 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 39 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s2 +; GCN-NEXT: s_xor_b32 s3, s2, s4 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_or_b32 s1, s1, 1 -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_ashr_i32 s3, s3, 30 +; GCN-NEXT: s_or_b32 s3, s3, 1 +; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s0 -; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 +; GCN-NEXT: v_mul_lo_u32 v0, v0, s4 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 25 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_srem25_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[6:7], 39 -; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[0:1], 39 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s6 -; GCN-IR-NEXT: s_xor_b32 s1, s6, s0 -; GCN-IR-NEXT: s_ashr_i32 s1, s1, 30 +; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[4:5], 39 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 39 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s2 +; GCN-IR-NEXT: s_xor_b32 s3, s2, s4 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_or_b32 s1, s1, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v3, s1 -; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_ashr_i32 s3, s3, 30 +; GCN-IR-NEXT: s_or_b32 s3, s3, 1 +; GCN-IR-NEXT: v_mov_b32_e32 v3, s3 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s0 -; GCN-IR-NEXT: s_mov_b32 s0, s4 -; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s4 +; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 25 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm %1 = ashr i64 %x, 39 %2 = ashr i64 %y, 39 @@ -730,66 +730,66 @@ define amdgpu_kernel void @s_test_srem31_64(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem31_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[6:7], s[6:7], 33 -; GCN-NEXT: s_ashr_i64 s[0:1], s[0:1], 33 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s6 -; GCN-NEXT: s_xor_b32 s1, s6, s0 -; GCN-NEXT: s_ashr_i32 s1, s1, 30 +; GCN-NEXT: s_ashr_i64 s[4:5], s[4:5], 33 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 33 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s2 +; GCN-NEXT: s_xor_b32 s3, s2, s4 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_or_b32 s1, s1, 1 -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_ashr_i32 s3, s3, 30 +; GCN-NEXT: s_or_b32 s3, s3, 1 +; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s0 -; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 +; GCN-NEXT: v_mul_lo_u32 v0, v0, s4 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 31 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_srem31_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[6:7], 33 -; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[0:1], 33 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s6 -; GCN-IR-NEXT: s_xor_b32 s1, s6, s0 -; GCN-IR-NEXT: s_ashr_i32 s1, s1, 30 +; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[4:5], 33 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 33 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s2 +; GCN-IR-NEXT: s_xor_b32 s3, s2, s4 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_or_b32 s1, s1, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v3, s1 -; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_ashr_i32 s3, s3, 30 +; GCN-IR-NEXT: s_or_b32 s3, s3, 1 +; GCN-IR-NEXT: v_mov_b32_e32 v3, s3 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s0 -; GCN-IR-NEXT: s_mov_b32 s0, s4 -; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s4 +; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 31 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm %1 = ashr i64 %x, 33 %2 = ashr i64 %y, 33 @@ -802,19 +802,19 @@ define amdgpu_kernel void @s_test_srem32_64(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem32_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xe -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s7 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GCN-NEXT: s_xor_b32 s1, s7, s0 -; GCN-NEXT: s_ashr_i32 s1, s1, 30 -; GCN-NEXT: s_or_b32 s1, s1, 1 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s3 +; GCN-NEXT: s_xor_b32 s2, s3, s4 +; GCN-NEXT: s_ashr_i32 s2, s2, 30 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_or_b32 s2, s2, 1 +; GCN-NEXT: v_mov_b32_e32 v3, s2 +; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 @@ -822,28 +822,28 @@ ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s0 -; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s7, v0 +; GCN-NEXT: v_mul_lo_u32 v0, v0, s4 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s3, v0 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_srem32_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s0, s[0:1], 0xe -; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_load_dword s4, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s7 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GCN-IR-NEXT: s_xor_b32 s1, s7, s0 -; GCN-IR-NEXT: s_ashr_i32 s1, s1, 30 -; GCN-IR-NEXT: s_or_b32 s1, s1, 1 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s3 +; GCN-IR-NEXT: s_xor_b32 s2, s3, s4 +; GCN-IR-NEXT: s_ashr_i32 s2, s2, 30 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v3, s1 -; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_or_b32 s2, s2, 1 +; GCN-IR-NEXT: v_mov_b32_e32 v3, s2 +; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 @@ -851,11 +851,11 @@ ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s0 -; GCN-IR-NEXT: s_mov_b32 s0, s4 -; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s7, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s4 +; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s3, v0 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm %1 = ashr i64 %x, 32 %2 = ashr i64 %y, 32 @@ -1123,34 +1123,34 @@ define amdgpu_kernel void @s_test_srem24_48(i48 addrspace(1)* %out, i48 %x, i48 %y) { ; GCN-LABEL: s_test_srem24_48: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s2, s[0:1], 0xb -; GCN-NEXT: s_load_dword s3, s[0:1], 0xc +; GCN-NEXT: s_load_dword s2, s[0:1], 0xc +; GCN-NEXT: s_load_dword s3, s[0:1], 0xe ; GCN-NEXT: s_load_dword s6, s[0:1], 0xd -; GCN-NEXT: s_load_dword s0, s[0:1], 0xe -; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s0, s[0:1], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NEXT: s_sext_i32_i16 s1, s3 +; GCN-NEXT: s_sext_i32_i16 s1, s2 +; GCN-NEXT: s_sext_i32_i16 s2, s3 ; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: s_sext_i32_i16 s0, s0 -; GCN-NEXT: v_alignbit_b32 v0, s0, v0, 24 +; GCN-NEXT: v_alignbit_b32 v0, s2, v0, 24 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, v0 +; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: v_alignbit_b32 v2, s1, v2, 24 ; GCN-NEXT: v_cvt_f32_i32_e32 v3, v2 -; GCN-NEXT: v_xor_b32_e32 v5, v2, v0 ; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v1 +; GCN-NEXT: v_xor_b32_e32 v5, v2, v0 ; GCN-NEXT: v_ashrrev_i32_e32 v5, 30, v5 ; GCN-NEXT: v_or_b32_e32 v5, 1, v5 -; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 ; GCN-NEXT: v_trunc_f32_e32 v4, v4 ; GCN-NEXT: v_mad_f32 v3, -v4, v1, v3 ; GCN-NEXT: v_cvt_i32_f32_e32 v4, v4 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1| ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc +; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v4 ; GCN-NEXT: v_mul_lo_u32 v0, v1, v0 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v2, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -1160,36 +1160,36 @@ ; ; GCN-IR-LABEL: s_test_srem24_48: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xb ; GCN-IR-NEXT: s_load_dword s3, s[0:1], 0xc -; GCN-IR-NEXT: s_load_dword s6, s[0:1], 0xd -; GCN-IR-NEXT: s_load_dword s0, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dword s5, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-IR-NEXT: s_load_dword s4, s[0:1], 0xd +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_sext_i32_i16 s3, s3 -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[2:3], 24 -; GCN-IR-NEXT: s_sext_i32_i16 s7, s0 -; GCN-IR-NEXT: s_ashr_i32 s0, s3, 31 -; GCN-IR-NEXT: s_ashr_i32 s12, s7, 31 -; GCN-IR-NEXT: s_ashr_i64 s[10:11], s[6:7], 24 -; GCN-IR-NEXT: s_mov_b32 s1, s0 -; GCN-IR-NEXT: s_mov_b32 s13, s12 -; GCN-IR-NEXT: s_xor_b64 s[2:3], s[8:9], s[0:1] -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[10:11], s[12:13] -; GCN-IR-NEXT: s_sub_u32 s2, s2, s0 -; GCN-IR-NEXT: s_subb_u32 s3, s3, s0 -; GCN-IR-NEXT: s_sub_u32 s6, s6, s12 -; GCN-IR-NEXT: s_subb_u32 s7, s7, s12 +; GCN-IR-NEXT: s_sext_i32_i16 s5, s5 +; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[2:3], 24 +; GCN-IR-NEXT: s_ashr_i32 s2, s3, 31 +; GCN-IR-NEXT: s_ashr_i32 s10, s5, 31 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 24 +; GCN-IR-NEXT: s_mov_b32 s3, s2 +; GCN-IR-NEXT: s_mov_b32 s11, s10 +; GCN-IR-NEXT: s_xor_b64 s[4:5], s[6:7], s[2:3] +; GCN-IR-NEXT: s_xor_b64 s[6:7], s[8:9], s[10:11] +; GCN-IR-NEXT: s_sub_u32 s4, s4, s2 +; GCN-IR-NEXT: s_subb_u32 s5, s5, s2 +; GCN-IR-NEXT: s_sub_u32 s6, s6, s10 +; GCN-IR-NEXT: s_subb_u32 s7, s7, s10 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[6:7], 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[2:3], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[4:5], 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 ; GCN-IR-NEXT: s_or_b64 s[16:17], s[10:11], s[12:13] ; GCN-IR-NEXT: s_flbit_i32_b32 s10, s6 -; GCN-IR-NEXT: s_flbit_i32_b32 s12, s2 +; GCN-IR-NEXT: s_flbit_i32_b32 s12, s4 ; GCN-IR-NEXT: s_add_i32 s10, s10, 32 ; GCN-IR-NEXT: s_flbit_i32_b32 s11, s7 ; GCN-IR-NEXT: s_add_i32 s12, s12, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s13, s3 +; GCN-IR-NEXT: s_flbit_i32_b32 s13, s5 ; GCN-IR-NEXT: s_min_u32 s10, s10, s11 ; GCN-IR-NEXT: s_min_u32 s14, s12, s13 ; GCN-IR-NEXT: s_sub_u32 s12, s10, s14 @@ -1210,10 +1210,10 @@ ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[0:1] ; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, vcc -; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[2:3], s12 +; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[4:5], s12 ; GCN-IR-NEXT: s_cbranch_vccz .LBB9_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[16:17], s[2:3], s16 +; GCN-IR-NEXT: s_lshr_b64 s[16:17], s[4:5], s16 ; GCN-IR-NEXT: s_add_u32 s18, s6, -1 ; GCN-IR-NEXT: s_addc_u32 s19, s7, -1 ; GCN-IR-NEXT: s_not_b64 s[8:9], s[10:11] @@ -1252,29 +1252,29 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v1, s9 ; GCN-IR-NEXT: s_branch .LBB9_6 ; GCN-IR-NEXT: .LBB9_5: -; GCN-IR-NEXT: v_mov_b32_e32 v0, s3 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s5 ; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[16:17] -; GCN-IR-NEXT: v_mov_b32_e32 v0, s2 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s4 ; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[16:17] ; GCN-IR-NEXT: .LBB9_6: ; %udiv-end ; GCN-IR-NEXT: v_mul_lo_u32 v1, s6, v1 ; GCN-IR-NEXT: v_mul_hi_u32 v2, s6, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v3, s7, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v0, s6, v0 -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s3 -; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s5 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc -; GCN-IR-NEXT: v_xor_b32_e32 v0, s0, v0 -; GCN-IR-NEXT: v_xor_b32_e32 v1, s1, v1 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s1 -; GCN-IR-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 +; GCN-IR-NEXT: v_xor_b32_e32 v0, s2, v0 +; GCN-IR-NEXT: v_xor_b32_e32 v1, s3, v1 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s3 +; GCN-IR-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc -; GCN-IR-NEXT: s_mov_b32 s6, -1 -; GCN-IR-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 -; GCN-IR-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 +; GCN-IR-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %1 = ashr i48 %x, 24 %2 = ashr i48 %y, 24 @@ -2055,60 +2055,60 @@ define amdgpu_kernel void @s_test_srem24_k_den_i64(i64 addrspace(1)* %out, i64 %x) { ; GCN-LABEL: s_test_srem24_k_den_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s1, 0x46b6fe00 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s4, 0x46b6fe00 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[6:7], s[6:7], 40 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GCN-NEXT: s_ashr_i32 s0, s6, 30 -; GCN-NEXT: s_or_b32 s0, s0, 1 -; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 40 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GCN-NEXT: s_ashr_i32 s3, s2, 30 +; GCN-NEXT: s_or_b32 s3, s3, 1 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: v_mul_f32_e32 v2, 0x38331158, v0 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_mad_f32 v0, -v2, s1, v0 +; GCN-NEXT: v_mad_f32 v0, -v2, s4, v0 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s1 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s4 ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; GCN-NEXT: s_movk_i32 s0, 0x5b7f +; GCN-NEXT: s_movk_i32 s3, 0x5b7f ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s0 -; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 +; GCN-NEXT: v_mul_lo_u32 v0, v0, s3 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_srem24_k_den_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_mov_b32 s1, 0x46b6fe00 -; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b32 s4, 0x46b6fe00 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[6:7], 40 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GCN-IR-NEXT: s_ashr_i32 s0, s6, 30 -; GCN-IR-NEXT: s_or_b32 s0, s0, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s0 +; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 40 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GCN-IR-NEXT: s_ashr_i32 s3, s2, 30 +; GCN-IR-NEXT: s_or_b32 s3, s3, 1 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s3 ; GCN-IR-NEXT: v_mul_f32_e32 v2, 0x38331158, v0 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-IR-NEXT: v_mad_f32 v0, -v2, s1, v0 +; GCN-IR-NEXT: v_mad_f32 v0, -v2, s4, v0 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s1 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s4 ; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; GCN-IR-NEXT: s_movk_i32 s0, 0x5b7f +; GCN-IR-NEXT: s_movk_i32 s3, 0x5b7f ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s0 -; GCN-IR-NEXT: s_mov_b32 s0, s4 -; GCN-IR-NEXT: s_mov_b32 s1, s5 -; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s3 +; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm %x.shr = ashr i64 %x, 40 %result = srem i64 %x.shr, 23423 diff --git a/llvm/test/CodeGen/AMDGPU/srl.ll b/llvm/test/CodeGen/AMDGPU/srl.ll --- a/llvm/test/CodeGen/AMDGPU/srl.ll +++ b/llvm/test/CodeGen/AMDGPU/srl.ll @@ -8,35 +8,33 @@ define amdgpu_kernel void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; SI-LABEL: lshr_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: lshr_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s0, s0, s1 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_lshr_b32 s4, s4, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: lshr_i32: @@ -66,38 +64,36 @@ define amdgpu_kernel void @lshr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { ; SI-LABEL: lshr_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshr_b32_e32 v1, v1, v3 ; SI-NEXT: v_lshr_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: lshr_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; VI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x8 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s4, s9, s7 -; VI-NEXT: s_lshr_b32 s5, s8, s6 -; VI-NEXT: v_mov_b32_e32 v0, s5 -; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_lshr_b32 s5, s5, s7 +; VI-NEXT: s_lshr_b32 s4, s4, s6 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -130,41 +126,39 @@ define amdgpu_kernel void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { ; SI-LABEL: lshr_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshr_b32_e32 v3, v3, v7 ; SI-NEXT: v_lshr_b32_e32 v2, v2, v6 ; SI-NEXT: v_lshr_b32_e32 v1, v1, v5 ; SI-NEXT: v_lshr_b32_e32 v0, v0, v4 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: lshr_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x10 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x10 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s7, s11, s7 -; VI-NEXT: s_lshr_b32 s6, s10, s6 -; VI-NEXT: s_lshr_b32 s5, s9, s5 -; VI-NEXT: s_lshr_b32 s4, s8, s4 +; VI-NEXT: s_lshr_b32 s7, s7, s11 +; VI-NEXT: s_lshr_b32 s6, s6, s10 +; VI-NEXT: s_lshr_b32 s5, s5, s9 +; VI-NEXT: s_lshr_b32 s4, s4, s8 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 @@ -203,36 +197,34 @@ define amdgpu_kernel void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { ; SI-LABEL: lshr_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], v2 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: lshr_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: lshr_i64: @@ -277,48 +269,46 @@ ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s8, s6 ; SI-NEXT: s_mov_b32 s9, s7 -; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 -; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:48 +; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 +; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48 +; SI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0 +; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32 ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], v10 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], v6 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], v13 -; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], v11 -; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], v8 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], v13 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], v11 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: lshr_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s12 -; VI-NEXT: s_mov_b32 s1, s13 -; VI-NEXT: s_load_dwordx8 s[4:11], s[14:15], 0x0 -; VI-NEXT: s_load_dwordx8 s[12:19], s[14:15], 0x20 +; VI-NEXT: s_load_dwordx8 s[0:7], s[18:19], 0x0 +; VI-NEXT: s_load_dwordx8 s[8:15], s[18:19], 0x20 +; VI-NEXT: s_mov_b32 s19, 0xf000 +; VI-NEXT: s_mov_b32 s18, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b64 s[10:11], s[10:11], s18 -; VI-NEXT: s_lshr_b64 s[8:9], s[8:9], s16 ; VI-NEXT: s_lshr_b64 s[6:7], s[6:7], s14 ; VI-NEXT: s_lshr_b64 s[4:5], s[4:5], s12 -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_mov_b32_e32 v2, s10 -; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; VI-NEXT: s_nop 0 +; VI-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 +; VI-NEXT: s_lshr_b64 s[0:1], s[0:1], s8 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16 +; VI-NEXT: s_nop 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: lshr_v4i64: @@ -384,26 +374,26 @@ define amdgpu_kernel void @s_lshr_32_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) { ; SI-LABEL: s_lshr_32_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0x14 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s4, s[0:1], 0x14 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_lshr_32_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x50 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x50 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: s_lshr_32_i64: @@ -431,10 +421,10 @@ ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[10:11], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4 ; SI-NEXT: s_mov_b64 s[4:5], s[0:1] -; SI-NEXT: s_mov_b64 s[0:1], s[2:3] -; SI-NEXT: s_mov_b64 s[2:3], s[6:7] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 offset:4 ; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 @@ -443,18 +433,18 @@ ; VI-LABEL: v_lshr_32_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v2, vcc +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v0, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v0, v[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/store-local.128.ll --- a/llvm/test/CodeGen/AMDGPU/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/store-local.128.ll @@ -34,15 +34,15 @@ ; ; GFX6-LABEL: store_lds_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: v_mov_b32_e32 v1, s3 -; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: v_mov_b32_e32 v3, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v4, s0 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: v_mov_b32_e32 v3, s5 ; GFX6-NEXT: ds_write2_b64 v4, v[2:3], v[0:1] offset1:1 ; GFX6-NEXT: s_endpgm ; @@ -458,15 +458,15 @@ ; ; GFX6-LABEL: store_lds_v4i32_align8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: v_mov_b32_e32 v1, s3 -; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: v_mov_b32_e32 v3, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v4, s0 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: v_mov_b32_e32 v3, s5 ; GFX6-NEXT: ds_write2_b64 v4, v[2:3], v[0:1] offset1:1 ; GFX6-NEXT: s_endpgm ; @@ -517,15 +517,15 @@ ; ; GFX6-LABEL: store_lds_v4i32_align16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: v_mov_b32_e32 v1, s3 -; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: v_mov_b32_e32 v3, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v4, s0 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: v_mov_b32_e32 v3, s5 ; GFX6-NEXT: ds_write2_b64 v4, v[2:3], v[0:1] offset1:1 ; GFX6-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/store-local.96.ll --- a/llvm/test/CodeGen/AMDGPU/store-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/store-local.96.ll @@ -7,26 +7,26 @@ define amdgpu_kernel void @store_lds_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: ds_write_b96 v3, v[0:2] ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v3i32: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, s4 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s0 ; GFX7-NEXT: ds_write_b96 v3, v[0:2] ; GFX7-NEXT: s_endpgm ; @@ -427,26 +427,26 @@ define amdgpu_kernel void @store_lds_v3i32_align16(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: ds_write_b96 v3, v[0:2] ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v3i32_align16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, s4 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s0 ; GFX7-NEXT: ds_write_b96 v3, v[0:2] ; GFX7-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll --- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll @@ -45,14 +45,14 @@ ; HAWAII-NEXT: v_mov_b32_e32 v0, s0 ; HAWAII-NEXT: v_mov_b32_e32 v1, s5 ; HAWAII-NEXT: flat_load_ubyte v0, v[0:1] -; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3 ; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0 ; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2 +; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3 ; HAWAII-NEXT: s_mov_b32 m0, -1 ; HAWAII-NEXT: s_waitcnt lgkmcnt(0) ; HAWAII-NEXT: v_mov_b32_e32 v1, s0 -; HAWAII-NEXT: v_mov_b32_e32 v3, s2 ; HAWAII-NEXT: v_mov_b32_e32 v2, s1 +; HAWAII-NEXT: v_mov_b32_e32 v3, s2 ; HAWAII-NEXT: ds_write_b16 v1, v3 offset:4 ; HAWAII-NEXT: s_waitcnt vmcnt(0) ; HAWAII-NEXT: v_and_b32_e32 v0, 0x7f, v0 @@ -66,15 +66,15 @@ ; FIJI-NEXT: v_mov_b32_e32 v0, s0 ; FIJI-NEXT: v_mov_b32_e32 v1, s5 ; FIJI-NEXT: flat_load_ubyte v0, v[0:1] -; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0 -; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8 -; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc +; FIJI-NEXT: s_load_dword s0, s[4:5], 0xc +; FIJI-NEXT: s_load_dword s1, s[4:5], 0x0 +; FIJI-NEXT: s_load_dword s2, s[4:5], 0x8 ; FIJI-NEXT: s_mov_b32 m0, -1 ; FIJI-NEXT: s_waitcnt lgkmcnt(0) -; FIJI-NEXT: v_mov_b32_e32 v1, s0 -; FIJI-NEXT: v_mov_b32_e32 v3, s1 -; FIJI-NEXT: s_and_b32 s3, s2, 0xffff -; FIJI-NEXT: v_mov_b32_e32 v2, s2 +; FIJI-NEXT: s_and_b32 s3, s0, 0xffff +; FIJI-NEXT: v_mov_b32_e32 v1, s1 +; FIJI-NEXT: v_mov_b32_e32 v2, s0 +; FIJI-NEXT: v_mov_b32_e32 v3, s2 ; FIJI-NEXT: ds_write_b16 v1, v2 offset:4 ; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -88,14 +88,14 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: global_load_ubyte_d16_hi v0, v0, s[4:5] offset:14 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0xc -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_and_b32 s3, s2, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: s_and_b32 s3, s0, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: ds_write_b16 v1, v2 offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_e32 v0, s3, v0 @@ -132,40 +132,40 @@ ; HAWAII-LABEL: local_store_i48: ; HAWAII: ; %bb.0: ; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0 -; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2 -; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3 +; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x3 +; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x2 ; HAWAII-NEXT: s_mov_b32 m0, -1 ; HAWAII-NEXT: s_waitcnt lgkmcnt(0) ; HAWAII-NEXT: v_mov_b32_e32 v0, s0 -; HAWAII-NEXT: v_mov_b32_e32 v1, s2 -; HAWAII-NEXT: ds_write_b16 v0, v1 offset:4 ; HAWAII-NEXT: v_mov_b32_e32 v1, s1 +; HAWAII-NEXT: ds_write_b16 v0, v1 offset:4 +; HAWAII-NEXT: v_mov_b32_e32 v1, s2 ; HAWAII-NEXT: ds_write_b32 v0, v1 ; HAWAII-NEXT: s_endpgm ; ; FIJI-LABEL: local_store_i48: ; FIJI: ; %bb.0: ; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0 -; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8 -; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc +; FIJI-NEXT: s_load_dword s1, s[4:5], 0xc +; FIJI-NEXT: s_load_dword s2, s[4:5], 0x8 ; FIJI-NEXT: s_mov_b32 m0, -1 ; FIJI-NEXT: s_waitcnt lgkmcnt(0) ; FIJI-NEXT: v_mov_b32_e32 v0, s0 -; FIJI-NEXT: v_mov_b32_e32 v1, s2 -; FIJI-NEXT: ds_write_b16 v0, v1 offset:4 ; FIJI-NEXT: v_mov_b32_e32 v1, s1 +; FIJI-NEXT: ds_write_b16 v0, v1 offset:4 +; FIJI-NEXT: v_mov_b32_e32 v1, s2 ; FIJI-NEXT: ds_write_b32 v0, v1 ; FIJI-NEXT: s_endpgm ; ; GFX9-LABEL: local_store_i48: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX9-NEXT: s_load_dword s1, s[4:5], 0xc +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: ds_write_b16 v0, v1 offset:4 ; GFX9-NEXT: ds_write_b32 v0, v2 ; GFX9-NEXT: s_endpgm @@ -190,14 +190,14 @@ define amdgpu_kernel void @local_store_i65(i65 addrspace(3)* %ptr, i65 %arg) #0 { ; HAWAII-LABEL: local_store_i65: ; HAWAII: ; %bb.0: -; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x0 +; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x4 +; HAWAII-NEXT: s_load_dword s3, s[4:5], 0x0 ; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 -; HAWAII-NEXT: s_load_dword s3, s[4:5], 0x4 ; HAWAII-NEXT: s_mov_b32 m0, -1 ; HAWAII-NEXT: s_waitcnt lgkmcnt(0) -; HAWAII-NEXT: v_mov_b32_e32 v2, s2 -; HAWAII-NEXT: s_and_b32 s3, s3, 1 -; HAWAII-NEXT: v_mov_b32_e32 v0, s3 +; HAWAII-NEXT: s_and_b32 s2, s2, 1 +; HAWAII-NEXT: v_mov_b32_e32 v2, s3 +; HAWAII-NEXT: v_mov_b32_e32 v0, s2 ; HAWAII-NEXT: ds_write_b8 v2, v0 offset:8 ; HAWAII-NEXT: v_mov_b32_e32 v0, s0 ; HAWAII-NEXT: v_mov_b32_e32 v1, s1 @@ -206,14 +206,14 @@ ; ; FIJI-LABEL: local_store_i65: ; FIJI: ; %bb.0: -; FIJI-NEXT: s_load_dword s2, s[4:5], 0x0 +; FIJI-NEXT: s_load_dword s2, s[4:5], 0x10 +; FIJI-NEXT: s_load_dword s3, s[4:5], 0x0 ; FIJI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; FIJI-NEXT: s_load_dword s3, s[4:5], 0x10 ; FIJI-NEXT: s_mov_b32 m0, -1 ; FIJI-NEXT: s_waitcnt lgkmcnt(0) -; FIJI-NEXT: v_mov_b32_e32 v2, s2 -; FIJI-NEXT: s_and_b32 s3, s3, 1 -; FIJI-NEXT: v_mov_b32_e32 v0, s3 +; FIJI-NEXT: s_and_b32 s2, s2, 1 +; FIJI-NEXT: v_mov_b32_e32 v2, s3 +; FIJI-NEXT: v_mov_b32_e32 v0, s2 ; FIJI-NEXT: ds_write_b8 v2, v0 offset:8 ; FIJI-NEXT: v_mov_b32_e32 v0, s0 ; FIJI-NEXT: v_mov_b32_e32 v1, s1 @@ -222,14 +222,14 @@ ; ; GFX9-LABEL: local_store_i65: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX9-NEXT: s_load_dword s3, s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: s_and_b32 s2, s2, 1 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_and_b32 s3, s3, 1 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: ds_write_b8 v2, v3 offset:8 ; GFX9-NEXT: ds_write_b64 v2, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll --- a/llvm/test/CodeGen/AMDGPU/sub.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.ll @@ -5,8 +5,8 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone speculatable ; GCN-LABEL: {{^}}s_sub_i32: -; GCN: s_load_dwordx2 ; GCN: s_load_dwordx2 s{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}} +; GCN: s_load_dwordx2 ; GCN: s_sub_i32 s{{[0-9]+}}, s[[A]], s[[B]] define amdgpu_kernel void @s_sub_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { %result = sub i32 %a, %b diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -8,19 +8,17 @@ ; GFX9-LABEL: v_test_sub_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: v_pk_sub_i16 v0, v1, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_test_sub_v2i16: @@ -78,40 +76,40 @@ define amdgpu_kernel void @s_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in0, <2 x i16> addrspace(4)* %in1) #1 { ; GFX9-LABEL: s_test_sub_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s10, s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s11, s[6:7], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-NEXT: v_pk_sub_i16 v0, s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s10 +; GFX9-NEXT: v_pk_sub_i16 v0, s11, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_test_sub_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dword s6, s[6:7], 0x0 +; VI-NEXT: s_load_dword s7, s[0:1], 0x0 ; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_load_dword s4, s[6:7], 0x0 -; VI-NEXT: s_load_dword s6, s[8:9], 0x0 ; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s5, s4, 16 -; VI-NEXT: s_lshr_b32 s7, s6, 16 -; VI-NEXT: s_sub_i32 s4, s4, s6 -; VI-NEXT: s_sub_i32 s5, s5, s7 -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_lshr_b32 s4, s6, 16 +; VI-NEXT: s_lshr_b32 s5, s7, 16 +; VI-NEXT: s_sub_i32 s6, s6, s7 +; VI-NEXT: s_sub_i32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s6, 0xffff +; VI-NEXT: s_lshl_b32 s4, s4, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -167,34 +165,34 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 { ; GFX9-LABEL: s_test_sub_v2i16_kernarg: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x30 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dword s3, s[0:1], 0x30 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_pk_sub_i16 v0, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_pk_sub_i16 v0, s3, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_test_sub_v2i16_kernarg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dword s0, s[0:1], 0x30 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dword s5, s[0:1], 0x30 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s1, s2, 16 -; VI-NEXT: s_lshr_b32 s3, s0, 16 -; VI-NEXT: s_sub_i32 s1, s1, s3 -; VI-NEXT: s_sub_i32 s0, s2, s0 -; VI-NEXT: s_lshl_b32 s1, s1, 16 -; VI-NEXT: s_and_b32 s0, s0, 0xffff -; VI-NEXT: s_or_b32 s0, s0, s1 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_lshr_b32 s6, s4, 16 +; VI-NEXT: s_lshr_b32 s7, s5, 16 +; VI-NEXT: s_sub_i32 s6, s6, s7 +; VI-NEXT: s_sub_i32 s4, s4, s5 +; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: s_test_sub_v2i16_kernarg: @@ -217,16 +215,14 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_constant: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s4, 0x1c8007b ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s4, 0x1c8007b -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_pk_sub_i16 v0, v0, s4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -276,16 +272,14 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_neg_constant: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s4, 0xfc21fcb3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s4, 0xfc21fcb3 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_pk_sub_i16 v0, v0, s4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -334,15 +328,13 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_inline_neg1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_pk_sub_i16 v0, v0, -1 op_sel_hi:[1,0] ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -391,15 +383,13 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_inline_lo_zero_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_pk_sub_i16 v0, v0, 32 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -448,35 +438,31 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_inline_fp_split: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s4, 1.0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s4, 1.0 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_pk_sub_i16 v0, v0, s4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_test_sub_v2i16_inline_fp_split: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, 0xffffc080 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll --- a/llvm/test/CodeGen/AMDGPU/udiv.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv.ll @@ -612,16 +612,16 @@ ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s2 ; SI-NEXT: s_mov_b32 s5, s3 -; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s2, 0x4f7ffffe ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_u32_e32 v8, v4 -; SI-NEXT: v_cvt_f32_u32_e32 v10, v5 -; SI-NEXT: v_cvt_f32_u32_e32 v12, v6 -; SI-NEXT: v_cvt_f32_u32_e32 v14, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_u32_e32 v8, v0 +; SI-NEXT: v_cvt_f32_u32_e32 v10, v1 +; SI-NEXT: v_cvt_f32_u32_e32 v12, v2 +; SI-NEXT: v_cvt_f32_u32_e32 v14, v3 ; SI-NEXT: v_rcp_iflag_f32_e32 v8, v8 ; SI-NEXT: v_rcp_iflag_f32_e32 v10, v10 ; SI-NEXT: v_rcp_iflag_f32_e32 v12, v12 @@ -634,10 +634,10 @@ ; SI-NEXT: v_cvt_u32_f32_e32 v10, v10 ; SI-NEXT: v_cvt_u32_f32_e32 v12, v12 ; SI-NEXT: v_cvt_u32_f32_e32 v14, v14 -; SI-NEXT: v_sub_i32_e32 v9, vcc, 0, v4 -; SI-NEXT: v_sub_i32_e32 v11, vcc, 0, v5 -; SI-NEXT: v_sub_i32_e32 v13, vcc, 0, v6 -; SI-NEXT: v_sub_i32_e32 v15, vcc, 0, v7 +; SI-NEXT: v_sub_i32_e32 v9, vcc, 0, v0 +; SI-NEXT: v_sub_i32_e32 v11, vcc, 0, v1 +; SI-NEXT: v_sub_i32_e32 v13, vcc, 0, v2 +; SI-NEXT: v_sub_i32_e32 v15, vcc, 0, v3 ; SI-NEXT: v_mul_lo_u32 v9, v9, v8 ; SI-NEXT: v_mul_lo_u32 v11, v11, v10 ; SI-NEXT: v_mul_lo_u32 v13, v13, v12 @@ -650,49 +650,50 @@ ; SI-NEXT: v_add_i32_e32 v9, vcc, v11, v10 ; SI-NEXT: v_add_i32_e32 v10, vcc, v13, v12 ; SI-NEXT: v_add_i32_e32 v11, vcc, v15, v14 -; SI-NEXT: v_mul_hi_u32 v8, v0, v8 -; SI-NEXT: v_mul_hi_u32 v9, v1, v9 -; SI-NEXT: v_mul_hi_u32 v10, v2, v10 -; SI-NEXT: v_mul_hi_u32 v11, v3, v11 -; SI-NEXT: v_mul_lo_u32 v12, v8, v4 -; SI-NEXT: v_mul_lo_u32 v14, v9, v5 -; SI-NEXT: v_mul_lo_u32 v16, v10, v6 -; SI-NEXT: v_mul_lo_u32 v18, v11, v7 -; SI-NEXT: v_subrev_i32_e32 v0, vcc, v12, v0 -; SI-NEXT: v_subrev_i32_e32 v1, vcc, v14, v1 -; SI-NEXT: v_subrev_i32_e32 v2, vcc, v16, v2 -; SI-NEXT: v_subrev_i32_e32 v3, vcc, v18, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_hi_u32 v8, v4, v8 +; SI-NEXT: v_mul_hi_u32 v9, v5, v9 +; SI-NEXT: v_mul_hi_u32 v10, v6, v10 +; SI-NEXT: v_mul_hi_u32 v11, v7, v11 +; SI-NEXT: v_mul_lo_u32 v12, v8, v0 +; SI-NEXT: v_mul_lo_u32 v14, v9, v1 +; SI-NEXT: v_mul_lo_u32 v16, v10, v2 +; SI-NEXT: v_mul_lo_u32 v18, v11, v3 +; SI-NEXT: v_subrev_i32_e32 v4, vcc, v12, v4 +; SI-NEXT: v_subrev_i32_e32 v5, vcc, v14, v5 +; SI-NEXT: v_subrev_i32_e32 v6, vcc, v16, v6 +; SI-NEXT: v_subrev_i32_e32 v7, vcc, v18, v7 ; SI-NEXT: v_add_i32_e32 v13, vcc, 1, v8 ; SI-NEXT: v_add_i32_e32 v15, vcc, 1, v9 ; SI-NEXT: v_add_i32_e32 v17, vcc, 1, v10 ; SI-NEXT: v_add_i32_e32 v19, vcc, 1, v11 -; SI-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 -; SI-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v5 -; SI-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 -; SI-NEXT: v_cmp_ge_u32_e64 s[6:7], v3, v7 +; SI-NEXT: v_cmp_ge_u32_e64 s[0:1], v4, v0 +; SI-NEXT: v_cmp_ge_u32_e64 s[2:3], v5, v1 +; SI-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v2 +; SI-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v3 ; SI-NEXT: v_cndmask_b32_e64 v8, v8, v13, s[0:1] -; SI-NEXT: v_subrev_i32_e32 v12, vcc, v4, v0 +; SI-NEXT: v_subrev_i32_e32 v12, vcc, v0, v4 ; SI-NEXT: v_cndmask_b32_e64 v9, v9, v15, s[2:3] -; SI-NEXT: v_subrev_i32_e32 v13, vcc, v5, v1 +; SI-NEXT: v_subrev_i32_e32 v13, vcc, v1, v5 ; SI-NEXT: v_cndmask_b32_e64 v10, v10, v17, s[4:5] -; SI-NEXT: v_subrev_i32_e32 v14, vcc, v6, v2 +; SI-NEXT: v_subrev_i32_e32 v14, vcc, v2, v6 ; SI-NEXT: v_cndmask_b32_e64 v11, v11, v19, s[6:7] -; SI-NEXT: v_subrev_i32_e32 v15, vcc, v7, v3 -; SI-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[0:1] +; SI-NEXT: v_subrev_i32_e32 v15, vcc, v3, v7 +; SI-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[0:1] ; SI-NEXT: v_add_i32_e32 v12, vcc, 1, v8 -; SI-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[2:3] +; SI-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[2:3] ; SI-NEXT: v_add_i32_e32 v13, vcc, 1, v9 -; SI-NEXT: v_cndmask_b32_e64 v2, v2, v14, s[4:5] +; SI-NEXT: v_cndmask_b32_e64 v6, v6, v14, s[4:5] ; SI-NEXT: v_add_i32_e32 v14, vcc, 1, v10 -; SI-NEXT: v_cndmask_b32_e64 v3, v3, v15, s[6:7] +; SI-NEXT: v_cndmask_b32_e64 v7, v7, v15, s[6:7] ; SI-NEXT: v_add_i32_e32 v15, vcc, 1, v11 -; SI-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 +; SI-NEXT: v_cmp_ge_u32_e32 vcc, v4, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v8, v12, vcc -; SI-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 +; SI-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1 ; SI-NEXT: v_cndmask_b32_e32 v1, v9, v13, vcc -; SI-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 +; SI-NEXT: v_cmp_ge_u32_e32 vcc, v6, v2 ; SI-NEXT: v_cndmask_b32_e32 v2, v10, v14, vcc -; SI-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7 +; SI-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3 ; SI-NEXT: v_cndmask_b32_e32 v3, v11, v15, vcc ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; SI-NEXT: s_endpgm @@ -707,16 +708,16 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s4, s2 ; VI-NEXT: s_mov_b32 s5, s3 -; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16 +; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 ; VI-NEXT: s_mov_b32 s2, 0x4f7ffffe ; VI-NEXT: s_mov_b32 s8, s0 ; VI-NEXT: s_mov_b32 s9, s1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f32_u32_e32 v8, v4 -; VI-NEXT: v_cvt_f32_u32_e32 v10, v5 -; VI-NEXT: v_cvt_f32_u32_e32 v12, v6 -; VI-NEXT: v_cvt_f32_u32_e32 v14, v7 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cvt_f32_u32_e32 v8, v0 +; VI-NEXT: v_cvt_f32_u32_e32 v10, v1 +; VI-NEXT: v_cvt_f32_u32_e32 v12, v2 +; VI-NEXT: v_cvt_f32_u32_e32 v14, v3 ; VI-NEXT: v_rcp_iflag_f32_e32 v8, v8 ; VI-NEXT: v_rcp_iflag_f32_e32 v10, v10 ; VI-NEXT: v_rcp_iflag_f32_e32 v12, v12 @@ -729,10 +730,10 @@ ; VI-NEXT: v_cvt_u32_f32_e32 v10, v10 ; VI-NEXT: v_cvt_u32_f32_e32 v12, v12 ; VI-NEXT: v_cvt_u32_f32_e32 v14, v14 -; VI-NEXT: v_sub_u32_e32 v9, vcc, 0, v4 -; VI-NEXT: v_sub_u32_e32 v11, vcc, 0, v5 -; VI-NEXT: v_sub_u32_e32 v13, vcc, 0, v6 -; VI-NEXT: v_sub_u32_e32 v15, vcc, 0, v7 +; VI-NEXT: v_sub_u32_e32 v9, vcc, 0, v0 +; VI-NEXT: v_sub_u32_e32 v11, vcc, 0, v1 +; VI-NEXT: v_sub_u32_e32 v13, vcc, 0, v2 +; VI-NEXT: v_sub_u32_e32 v15, vcc, 0, v3 ; VI-NEXT: v_mul_lo_u32 v9, v9, v8 ; VI-NEXT: v_mul_lo_u32 v11, v11, v10 ; VI-NEXT: v_mul_lo_u32 v13, v13, v12 @@ -745,49 +746,50 @@ ; VI-NEXT: v_add_u32_e32 v9, vcc, v11, v10 ; VI-NEXT: v_add_u32_e32 v10, vcc, v13, v12 ; VI-NEXT: v_add_u32_e32 v11, vcc, v15, v14 -; VI-NEXT: v_mul_hi_u32 v8, v0, v8 -; VI-NEXT: v_mul_hi_u32 v9, v1, v9 -; VI-NEXT: v_mul_hi_u32 v10, v2, v10 -; VI-NEXT: v_mul_hi_u32 v11, v3, v11 -; VI-NEXT: v_mul_lo_u32 v12, v8, v4 -; VI-NEXT: v_mul_lo_u32 v14, v9, v5 -; VI-NEXT: v_mul_lo_u32 v16, v10, v6 -; VI-NEXT: v_mul_lo_u32 v18, v11, v7 -; VI-NEXT: v_subrev_u32_e32 v0, vcc, v12, v0 -; VI-NEXT: v_subrev_u32_e32 v1, vcc, v14, v1 -; VI-NEXT: v_subrev_u32_e32 v2, vcc, v16, v2 -; VI-NEXT: v_subrev_u32_e32 v3, vcc, v18, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mul_hi_u32 v8, v4, v8 +; VI-NEXT: v_mul_hi_u32 v9, v5, v9 +; VI-NEXT: v_mul_hi_u32 v10, v6, v10 +; VI-NEXT: v_mul_hi_u32 v11, v7, v11 +; VI-NEXT: v_mul_lo_u32 v12, v8, v0 +; VI-NEXT: v_mul_lo_u32 v14, v9, v1 +; VI-NEXT: v_mul_lo_u32 v16, v10, v2 +; VI-NEXT: v_mul_lo_u32 v18, v11, v3 +; VI-NEXT: v_subrev_u32_e32 v4, vcc, v12, v4 +; VI-NEXT: v_subrev_u32_e32 v5, vcc, v14, v5 +; VI-NEXT: v_subrev_u32_e32 v6, vcc, v16, v6 +; VI-NEXT: v_subrev_u32_e32 v7, vcc, v18, v7 ; VI-NEXT: v_add_u32_e32 v13, vcc, 1, v8 ; VI-NEXT: v_add_u32_e32 v15, vcc, 1, v9 ; VI-NEXT: v_add_u32_e32 v17, vcc, 1, v10 ; VI-NEXT: v_add_u32_e32 v19, vcc, 1, v11 -; VI-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 -; VI-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v5 -; VI-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 -; VI-NEXT: v_cmp_ge_u32_e64 s[6:7], v3, v7 +; VI-NEXT: v_cmp_ge_u32_e64 s[0:1], v4, v0 +; VI-NEXT: v_cmp_ge_u32_e64 s[2:3], v5, v1 +; VI-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v2 +; VI-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v3 ; VI-NEXT: v_cndmask_b32_e64 v8, v8, v13, s[0:1] -; VI-NEXT: v_subrev_u32_e32 v12, vcc, v4, v0 +; VI-NEXT: v_subrev_u32_e32 v12, vcc, v0, v4 ; VI-NEXT: v_cndmask_b32_e64 v9, v9, v15, s[2:3] -; VI-NEXT: v_subrev_u32_e32 v13, vcc, v5, v1 +; VI-NEXT: v_subrev_u32_e32 v13, vcc, v1, v5 ; VI-NEXT: v_cndmask_b32_e64 v10, v10, v17, s[4:5] -; VI-NEXT: v_subrev_u32_e32 v14, vcc, v6, v2 +; VI-NEXT: v_subrev_u32_e32 v14, vcc, v2, v6 ; VI-NEXT: v_cndmask_b32_e64 v11, v11, v19, s[6:7] -; VI-NEXT: v_subrev_u32_e32 v15, vcc, v7, v3 -; VI-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[0:1] +; VI-NEXT: v_subrev_u32_e32 v15, vcc, v3, v7 +; VI-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[0:1] ; VI-NEXT: v_add_u32_e32 v12, vcc, 1, v8 -; VI-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[2:3] +; VI-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[2:3] ; VI-NEXT: v_add_u32_e32 v13, vcc, 1, v9 -; VI-NEXT: v_cndmask_b32_e64 v2, v2, v14, s[4:5] +; VI-NEXT: v_cndmask_b32_e64 v6, v6, v14, s[4:5] ; VI-NEXT: v_add_u32_e32 v14, vcc, 1, v10 -; VI-NEXT: v_cndmask_b32_e64 v3, v3, v15, s[6:7] +; VI-NEXT: v_cndmask_b32_e64 v7, v7, v15, s[6:7] ; VI-NEXT: v_add_u32_e32 v15, vcc, 1, v11 -; VI-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 +; VI-NEXT: v_cmp_ge_u32_e32 vcc, v4, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v8, v12, vcc -; VI-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 +; VI-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1 ; VI-NEXT: v_cndmask_b32_e32 v1, v9, v13, vcc -; VI-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 +; VI-NEXT: v_cmp_ge_u32_e32 vcc, v6, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v10, v14, vcc -; VI-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7 +; VI-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v11, v15, vcc ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; VI-NEXT: s_endpgm @@ -796,22 +798,22 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_add_u32 s2, s2, 16 -; GCN-NEXT: s_addc_u32 s3, s3, 0 +; GCN-NEXT: s_add_u32 s4, s2, 16 +; GCN-NEXT: s_addc_u32 s5, s3, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NEXT: v_mov_b32_e32 v4, s2 -; GCN-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-NEXT: s_mov_b32 s2, 0x4f7ffffe ; GCN-NEXT: v_mov_b32_e32 v8, s0 ; GCN-NEXT: v_mov_b32_e32 v9, s1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_u32_e32 v10, v4 -; GCN-NEXT: v_cvt_f32_u32_e32 v12, v5 -; GCN-NEXT: v_cvt_f32_u32_e32 v14, v6 -; GCN-NEXT: v_cvt_f32_u32_e32 v16, v7 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_cvt_f32_u32_e32 v10, v0 +; GCN-NEXT: v_cvt_f32_u32_e32 v12, v1 +; GCN-NEXT: v_cvt_f32_u32_e32 v14, v2 +; GCN-NEXT: v_cvt_f32_u32_e32 v16, v3 ; GCN-NEXT: v_rcp_iflag_f32_e32 v10, v10 ; GCN-NEXT: v_rcp_iflag_f32_e32 v12, v12 ; GCN-NEXT: v_rcp_iflag_f32_e32 v14, v14 @@ -824,10 +826,10 @@ ; GCN-NEXT: v_cvt_u32_f32_e32 v12, v12 ; GCN-NEXT: v_cvt_u32_f32_e32 v14, v14 ; GCN-NEXT: v_cvt_u32_f32_e32 v16, v16 -; GCN-NEXT: v_sub_u32_e32 v11, vcc, 0, v4 -; GCN-NEXT: v_sub_u32_e32 v13, vcc, 0, v5 -; GCN-NEXT: v_sub_u32_e32 v15, vcc, 0, v6 -; GCN-NEXT: v_sub_u32_e32 v17, vcc, 0, v7 +; GCN-NEXT: v_sub_u32_e32 v11, vcc, 0, v0 +; GCN-NEXT: v_sub_u32_e32 v13, vcc, 0, v1 +; GCN-NEXT: v_sub_u32_e32 v15, vcc, 0, v2 +; GCN-NEXT: v_sub_u32_e32 v17, vcc, 0, v3 ; GCN-NEXT: v_mul_lo_u32 v11, v11, v10 ; GCN-NEXT: v_mul_lo_u32 v13, v13, v12 ; GCN-NEXT: v_mul_lo_u32 v15, v15, v14 @@ -840,49 +842,50 @@ ; GCN-NEXT: v_add_u32_e32 v11, vcc, v13, v12 ; GCN-NEXT: v_add_u32_e32 v12, vcc, v15, v14 ; GCN-NEXT: v_add_u32_e32 v13, vcc, v17, v16 -; GCN-NEXT: v_mul_hi_u32 v10, v0, v10 -; GCN-NEXT: v_mul_hi_u32 v11, v1, v11 -; GCN-NEXT: v_mul_hi_u32 v12, v2, v12 -; GCN-NEXT: v_mul_hi_u32 v13, v3, v13 -; GCN-NEXT: v_mul_lo_u32 v14, v10, v4 -; GCN-NEXT: v_mul_lo_u32 v16, v11, v5 -; GCN-NEXT: v_mul_lo_u32 v18, v12, v6 -; GCN-NEXT: v_mul_lo_u32 v19, v13, v7 -; GCN-NEXT: v_subrev_u32_e32 v0, vcc, v14, v0 -; GCN-NEXT: v_subrev_u32_e32 v1, vcc, v16, v1 -; GCN-NEXT: v_subrev_u32_e32 v2, vcc, v18, v2 -; GCN-NEXT: v_subrev_u32_e32 v3, vcc, v19, v3 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_hi_u32 v10, v4, v10 +; GCN-NEXT: v_mul_hi_u32 v11, v5, v11 +; GCN-NEXT: v_mul_hi_u32 v12, v6, v12 +; GCN-NEXT: v_mul_hi_u32 v13, v7, v13 +; GCN-NEXT: v_mul_lo_u32 v14, v10, v0 +; GCN-NEXT: v_mul_lo_u32 v16, v11, v1 +; GCN-NEXT: v_mul_lo_u32 v18, v12, v2 +; GCN-NEXT: v_mul_lo_u32 v19, v13, v3 +; GCN-NEXT: v_subrev_u32_e32 v4, vcc, v14, v4 +; GCN-NEXT: v_subrev_u32_e32 v5, vcc, v16, v5 +; GCN-NEXT: v_subrev_u32_e32 v6, vcc, v18, v6 +; GCN-NEXT: v_subrev_u32_e32 v7, vcc, v19, v7 ; GCN-NEXT: v_add_u32_e32 v15, vcc, 1, v10 ; GCN-NEXT: v_add_u32_e32 v17, vcc, 1, v11 ; GCN-NEXT: v_add_u32_e32 v14, vcc, 1, v12 ; GCN-NEXT: v_add_u32_e32 v16, vcc, 1, v13 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 -; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v5 -; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 -; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v3, v7 +; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v4, v0 +; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v5, v1 +; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v2 +; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v3 ; GCN-NEXT: v_cndmask_b32_e64 v10, v10, v15, s[0:1] -; GCN-NEXT: v_subrev_u32_e32 v15, vcc, v4, v0 +; GCN-NEXT: v_subrev_u32_e32 v15, vcc, v0, v4 ; GCN-NEXT: v_cndmask_b32_e64 v11, v11, v17, s[2:3] -; GCN-NEXT: v_subrev_u32_e32 v17, vcc, v5, v1 +; GCN-NEXT: v_subrev_u32_e32 v17, vcc, v1, v5 ; GCN-NEXT: v_cndmask_b32_e64 v12, v12, v14, s[4:5] -; GCN-NEXT: v_subrev_u32_e32 v14, vcc, v6, v2 +; GCN-NEXT: v_subrev_u32_e32 v14, vcc, v2, v6 ; GCN-NEXT: v_cndmask_b32_e64 v13, v13, v16, s[6:7] -; GCN-NEXT: v_subrev_u32_e32 v16, vcc, v7, v3 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v15, s[0:1] +; GCN-NEXT: v_subrev_u32_e32 v16, vcc, v3, v7 +; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v15, s[0:1] ; GCN-NEXT: v_add_u32_e32 v15, vcc, 1, v10 -; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v17, s[2:3] +; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v17, s[2:3] ; GCN-NEXT: v_add_u32_e32 v17, vcc, 1, v11 -; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v14, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v14, s[4:5] ; GCN-NEXT: v_add_u32_e32 v14, vcc, 1, v12 -; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v16, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v16, s[6:7] ; GCN-NEXT: v_add_u32_e32 v16, vcc, 1, v13 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v4, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v10, v15, vcc -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1 ; GCN-NEXT: v_cndmask_b32_e32 v1, v11, v17, vcc -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v2 ; GCN-NEXT: v_cndmask_b32_e32 v2, v12, v14, vcc -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3 ; GCN-NEXT: v_cndmask_b32_e32 v3, v13, v16, vcc ; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCN-NEXT: s_endpgm @@ -1068,47 +1071,47 @@ define amdgpu_kernel void @udiv_i32_div_pow2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; SI-LABEL: udiv_i32_div_pow2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v0, 4, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: udiv_i32_div_pow2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v0, 4, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GCN-LABEL: udiv_i32_div_pow2: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NEXT: v_mov_b32_e32 v3, s3 -; GCN-NEXT: flat_load_dword v2, v[2:3] +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: flat_load_dword v2, v[0:1] ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -1153,42 +1156,42 @@ define amdgpu_kernel void @udiv_i32_div_k_even(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; SI-LABEL: udiv_i32_div_k_even: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s0, 0xfabbd9c1 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s2, 0xfabbd9c1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_hi_u32 v0, v0, s0 -; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: v_mul_hi_u32 v0, v0, s2 ; SI-NEXT: v_lshrrev_b32_e32 v0, 25, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: udiv_i32_div_k_even: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; VI-NEXT: s_mov_b32 s0, 0xfabbd9c1 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s2, 0xfabbd9c1 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mul_hi_u32 v0, v0, s0 -; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: v_mul_hi_u32 v0, v0, s2 ; VI-NEXT: v_lshrrev_b32_e32 v0, 25, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GCN-LABEL: udiv_i32_div_k_even: @@ -1247,42 +1250,42 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; SI-LABEL: udiv_i32_div_k_odd: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s0, 0x7d5deca3 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s2, 0x7d5deca3 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_hi_u32 v0, v0, s0 -; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: v_mul_hi_u32 v0, v0, s2 ; SI-NEXT: v_lshrrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: udiv_i32_div_k_odd: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; VI-NEXT: s_mov_b32 s0, 0x7d5deca3 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s2, 0x7d5deca3 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mul_hi_u32 v0, v0, s0 -; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: v_mul_hi_u32 v0, v0, s2 ; VI-NEXT: v_lshrrev_b32_e32 v0, 24, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GCN-LABEL: udiv_i32_div_k_odd: @@ -1341,60 +1344,60 @@ define amdgpu_kernel void @v_udiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { ; SI-LABEL: v_udiv_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 -; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 -; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:1 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:1 +; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; SI-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 -; SI-NEXT: v_rcp_iflag_f32_e32 v2, v1 -; SI-NEXT: v_mul_f32_e32 v2, v0, v2 +; SI-NEXT: v_mul_f32_e32 v2, v1, v2 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_cvt_u32_f32_e32 v3, v2 -; SI-NEXT: v_mad_f32 v0, -v2, v1, v0 -; SI-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v1 +; SI-NEXT: v_mad_f32 v1, -v2, v0, v1 +; SI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; SI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_udiv_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 -; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:1 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:1 +; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 -; VI-NEXT: v_rcp_iflag_f32_e32 v2, v1 -; VI-NEXT: v_mul_f32_e32 v2, v0, v2 +; VI-NEXT: v_mul_f32_e32 v2, v1, v2 ; VI-NEXT: v_trunc_f32_e32 v2, v2 ; VI-NEXT: v_cvt_u32_f32_e32 v3, v2 -; VI-NEXT: v_mad_f32 v0, -v2, v1, v0 -; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v1 +; VI-NEXT: v_mad_f32 v1, -v2, v0, v1 +; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; VI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc ; VI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GCN-LABEL: v_udiv_i8: @@ -1481,18 +1484,18 @@ define amdgpu_kernel void @v_udiv_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { ; SI-LABEL: v_udiv_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 offset:2 ; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_u32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -1505,23 +1508,23 @@ ; SI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; SI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_udiv_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 offset:2 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cvt_f32_u32_e32 v0, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1534,7 +1537,7 @@ ; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; VI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc ; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GCN-LABEL: v_udiv_i16: @@ -1543,9 +1546,9 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_u32 s4, s2, 2 ; GCN-NEXT: s_addc_u32 s5, s3, 0 -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: v_mov_b32_e32 v3, s5 -; GCN-NEXT: flat_load_ushort v2, v[2:3] +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: flat_load_ushort v2, v[0:1] ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: flat_load_ushort v0, v[0:1] @@ -1630,72 +1633,76 @@ define amdgpu_kernel void @v_udiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) { ; SI-LABEL: v_udiv_i23: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 -; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:2 -; SI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:4 -; SI-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:6 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:6 +; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:4 +; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2 +; SI-NEXT: buffer_load_ushort v3, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_cvt_f32_u32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_cvt_f32_u32_e32 v0, v0 -; SI-NEXT: v_rcp_iflag_f32_e32 v1, v2 -; SI-NEXT: v_mul_f32_e32 v1, v0, v1 -; SI-NEXT: v_trunc_f32_e32 v1, v1 -; SI-NEXT: v_cvt_u32_f32_e32 v3, v1 -; SI-NEXT: v_mad_f32 v0, -v1, v2, v0 -; SI-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_u32_e32 v1, v1 +; SI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; SI-NEXT: v_mul_f32_e32 v2, v1, v2 +; SI-NEXT: v_trunc_f32_e32 v2, v2 +; SI-NEXT: v_cvt_u32_f32_e32 v3, v2 +; SI-NEXT: v_mad_f32 v1, -v2, v0, v1 +; SI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; SI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc ; SI-NEXT: v_and_b32_e32 v0, 0x7fffff, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_udiv_i23: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:2 -; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:4 -; VI-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:6 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:6 +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:4 +; VI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2 +; VI-NEXT: buffer_load_ushort v3, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: v_cvt_f32_u32_e32 v2, v2 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: v_cvt_f32_u32_e32 v0, v0 -; VI-NEXT: v_rcp_iflag_f32_e32 v1, v2 -; VI-NEXT: v_mul_f32_e32 v1, v0, v1 -; VI-NEXT: v_trunc_f32_e32 v1, v1 -; VI-NEXT: v_cvt_u32_f32_e32 v3, v1 -; VI-NEXT: v_mad_f32 v0, -v1, v2, v0 -; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v2 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_e32 v1, v3, v1 +; VI-NEXT: v_cvt_f32_u32_e32 v1, v1 +; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; VI-NEXT: v_mul_f32_e32 v2, v1, v2 +; VI-NEXT: v_trunc_f32_e32 v2, v2 +; VI-NEXT: v_cvt_u32_f32_e32 v3, v2 +; VI-NEXT: v_mad_f32 v1, -v2, v0, v1 +; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; VI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc ; VI-NEXT: v_and_b32_e32 v0, 0x7fffff, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GCN-LABEL: v_udiv_i23: @@ -1705,28 +1712,30 @@ ; GCN-NEXT: s_add_u32 s4, s2, 4 ; GCN-NEXT: s_addc_u32 s5, s3, 0 ; GCN-NEXT: s_add_u32 s6, s2, 2 -; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_addc_u32 s7, s3, 0 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_add_u32 s2, s2, 6 -; GCN-NEXT: v_mov_b32_e32 v4, s6 -; GCN-NEXT: s_addc_u32 s3, s3, 0 -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: v_mov_b32_e32 v5, s7 -; GCN-NEXT: v_mov_b32_e32 v7, s3 -; GCN-NEXT: v_mov_b32_e32 v3, s5 -; GCN-NEXT: v_mov_b32_e32 v6, s2 -; GCN-NEXT: flat_load_ubyte v4, v[4:5] -; GCN-NEXT: flat_load_ushort v0, v[0:1] -; GCN-NEXT: flat_load_ubyte v1, v[6:7] -; GCN-NEXT: flat_load_ushort v2, v[2:3] +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: s_add_u32 s6, s2, 6 +; GCN-NEXT: s_addc_u32 s7, s3, 0 +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_mov_b32_e32 v3, s7 +; GCN-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NEXT: v_mov_b32_e32 v5, s5 +; GCN-NEXT: flat_load_ubyte v6, v[2:3] +; GCN-NEXT: flat_load_ushort v4, v[4:5] +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-NEXT: flat_load_ushort v1, v[2:3] +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: v_or_b32_e32 v2, v4, v2 +; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: v_cvt_f32_u32_e32 v2, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 ; GCN-NEXT: v_cvt_f32_u32_e32 v3, v0 ; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 ; GCN-NEXT: v_mov_b32_e32 v0, s0 @@ -1829,30 +1838,32 @@ ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s8, s2 ; SI-NEXT: s_mov_b32 s9, s3 -; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 offset:4 -; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:6 +; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:6 +; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:4 +; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2 ; SI-NEXT: buffer_load_ushort v3, off, s[8:11], 0 -; SI-NEXT: buffer_load_ubyte v4, off, s[8:11], 0 offset:2 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_cvt_f32_u32_e32 v1, v0 -; SI-NEXT: v_sub_i32_e32 v2, vcc, 0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_sub_i32_e32 v4, vcc, 0, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; SI-NEXT: v_cvt_u32_f32_e32 v1, v1 -; SI-NEXT: v_mul_lo_u32 v2, v2, v1 -; SI-NEXT: v_mul_hi_u32 v2, v1, v2 -; SI-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; SI-NEXT: v_mul_hi_u32 v1, v3, v1 -; SI-NEXT: v_mul_lo_u32 v2, v1, v0 +; SI-NEXT: v_mul_lo_u32 v4, v4, v1 +; SI-NEXT: v_mul_hi_u32 v4, v1, v4 +; SI-NEXT: v_add_i32_e32 v1, vcc, v4, v1 +; SI-NEXT: v_mul_hi_u32 v1, v2, v1 +; SI-NEXT: v_mul_lo_u32 v3, v1, v0 ; SI-NEXT: v_add_i32_e32 v4, vcc, 1, v1 -; SI-NEXT: v_subrev_i32_e32 v2, vcc, v2, v3 +; SI-NEXT: v_subrev_i32_e32 v2, vcc, v3, v2 ; SI-NEXT: v_cmp_ge_u32_e64 s[0:1], v2, v0 ; SI-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] ; SI-NEXT: v_subrev_i32_e32 v3, vcc, v0, v2 @@ -1874,30 +1885,32 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s8, s2 ; VI-NEXT: s_mov_b32 s9, s3 -; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 offset:4 -; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:6 +; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:6 +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:4 +; VI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2 ; VI-NEXT: buffer_load_ushort v3, off, s[8:11], 0 -; VI-NEXT: buffer_load_ubyte v4, off, s[8:11], 0 offset:2 ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: v_cvt_f32_u32_e32 v1, v0 -; VI-NEXT: v_sub_u32_e32 v2, vcc, 0, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_sub_u32_e32 v4, vcc, 0, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; VI-NEXT: v_or_b32_e32 v3, v3, v4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_e32 v2, v3, v2 ; VI-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; VI-NEXT: v_cvt_u32_f32_e32 v1, v1 -; VI-NEXT: v_mul_lo_u32 v2, v2, v1 -; VI-NEXT: v_mul_hi_u32 v2, v1, v2 -; VI-NEXT: v_add_u32_e32 v1, vcc, v2, v1 -; VI-NEXT: v_mul_hi_u32 v1, v3, v1 -; VI-NEXT: v_mul_lo_u32 v2, v1, v0 +; VI-NEXT: v_mul_lo_u32 v4, v4, v1 +; VI-NEXT: v_mul_hi_u32 v4, v1, v4 +; VI-NEXT: v_add_u32_e32 v1, vcc, v4, v1 +; VI-NEXT: v_mul_hi_u32 v1, v2, v1 +; VI-NEXT: v_mul_lo_u32 v3, v1, v0 ; VI-NEXT: v_add_u32_e32 v4, vcc, 1, v1 -; VI-NEXT: v_subrev_u32_e32 v2, vcc, v2, v3 +; VI-NEXT: v_subrev_u32_e32 v2, vcc, v3, v2 ; VI-NEXT: v_cmp_ge_u32_e64 s[0:1], v2, v0 ; VI-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] ; VI-NEXT: v_subrev_u32_e32 v3, vcc, v0, v2 @@ -1923,43 +1936,43 @@ ; GCN-NEXT: s_addc_u32 s5, s3, 0 ; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: v_mov_b32_e32 v3, s5 -; GCN-NEXT: flat_load_ubyte v2, v[2:3] -; GCN-NEXT: flat_load_ushort v0, v[0:1] -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v0, v1 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, v4 -; GCN-NEXT: v_sub_u32_e32 v6, vcc, 0, v4 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v5, v2 +; GCN-NEXT: flat_load_ubyte v4, v[2:3] +; GCN-NEXT: flat_load_ushort v5, v[0:1] ; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s7 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: flat_load_ubyte v2, v[2:3] ; GCN-NEXT: flat_load_ushort v0, v[0:1] -; GCN-NEXT: v_mul_lo_u32 v6, v6, v5 -; GCN-NEXT: v_mul_hi_u32 v1, v5, v6 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: v_or_b32_e32 v3, v5, v1 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, v3 +; GCN-NEXT: v_sub_u32_e32 v4, vcc, 0, v3 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_or_b32_e32 v2, v0, v2 -; GCN-NEXT: v_add_u32_e32 v0, vcc, v1, v5 -; GCN-NEXT: v_mul_hi_u32 v3, v2, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-NEXT: v_mul_lo_u32 v4, v4, v1 +; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 +; GCN-NEXT: v_add_u32_e32 v0, vcc, v4, v1 +; GCN-NEXT: v_mul_hi_u32 v4, v2, v0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_mul_lo_u32 v5, v3, v4 -; GCN-NEXT: v_add_u32_e32 v6, vcc, 1, v3 +; GCN-NEXT: v_mul_lo_u32 v5, v4, v3 +; GCN-NEXT: v_add_u32_e32 v6, vcc, 1, v4 ; GCN-NEXT: v_subrev_u32_e32 v2, vcc, v5, v2 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v2, v4 -; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] -; GCN-NEXT: v_subrev_u32_e32 v5, vcc, v4, v2 +; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v2, v3 +; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[0:1] +; GCN-NEXT: v_subrev_u32_e32 v5, vcc, v3, v2 ; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] -; GCN-NEXT: v_add_u32_e32 v5, vcc, 1, v3 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 -; GCN-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; GCN-NEXT: v_add_u32_e32 v5, vcc, 1, v4 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v2, v3 +; GCN-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc ; GCN-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm @@ -2056,58 +2069,58 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(<4 x i32> addrspace(1)* nocapture readonly %in, <4 x i32> addrspace(1)* nocapture %out) { ; SI-LABEL: scalarize_mulhu_4xi32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 -; SI-NEXT: s_mov_b32 s4, 0x1389c755 -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s0, 0x1389c755 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 2, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 2, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 2, v3 -; SI-NEXT: v_mul_hi_u32 v0, v0, s4 -; SI-NEXT: v_mul_hi_u32 v1, v1, s4 -; SI-NEXT: v_mul_hi_u32 v2, v2, s4 -; SI-NEXT: v_mul_hi_u32 v3, v3, s4 +; SI-NEXT: v_mul_hi_u32 v0, v0, s0 +; SI-NEXT: v_mul_hi_u32 v1, v1, s0 +; SI-NEXT: v_mul_hi_u32 v2, v2, s0 +; SI-NEXT: v_mul_hi_u32 v3, v3, s0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 10, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 10, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 10, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 10, v3 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: scalarize_mulhu_4xi32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 -; VI-NEXT: s_mov_b32 s4, 0x1389c755 -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s0, 0x1389c755 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v0, 2, v0 ; VI-NEXT: v_lshrrev_b32_e32 v1, 2, v1 ; VI-NEXT: v_lshrrev_b32_e32 v2, 2, v2 ; VI-NEXT: v_lshrrev_b32_e32 v3, 2, v3 -; VI-NEXT: v_mul_hi_u32 v0, v0, s4 -; VI-NEXT: v_mul_hi_u32 v1, v1, s4 -; VI-NEXT: v_mul_hi_u32 v2, v2, s4 -; VI-NEXT: v_mul_hi_u32 v3, v3, s4 +; VI-NEXT: v_mul_hi_u32 v0, v0, s0 +; VI-NEXT: v_mul_hi_u32 v1, v1, s0 +; VI-NEXT: v_mul_hi_u32 v2, v2, s0 +; VI-NEXT: v_mul_hi_u32 v3, v3, s0 ; VI-NEXT: v_lshrrev_b32_e32 v0, 10, v0 ; VI-NEXT: v_lshrrev_b32_e32 v1, 10, v1 ; VI-NEXT: v_lshrrev_b32_e32 v2, 10, v2 ; VI-NEXT: v_lshrrev_b32_e32 v3, 10, v3 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GCN-LABEL: scalarize_mulhu_4xi32: @@ -2333,22 +2346,22 @@ ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s1, s0 ; SI-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 -; SI-NEXT: buffer_load_sbyte v2, off, s[0:3], 0 +; SI-NEXT: buffer_load_sbyte v1, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_i32_e32 v1, v0 +; SI-NEXT: v_cvt_f32_i32_e32 v2, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_i32_e32 v3, v2 -; SI-NEXT: v_xor_b32_e32 v0, v2, v0 +; SI-NEXT: v_cvt_f32_i32_e32 v3, v1 +; SI-NEXT: v_xor_b32_e32 v0, v1, v0 ; SI-NEXT: v_ashrrev_i32_e32 v0, 30, v0 -; SI-NEXT: v_rcp_iflag_f32_e32 v4, v1 +; SI-NEXT: v_rcp_iflag_f32_e32 v4, v2 ; SI-NEXT: v_or_b32_e32 v0, 1, v0 -; SI-NEXT: v_mul_f32_e32 v2, v3, v4 -; SI-NEXT: v_trunc_f32_e32 v2, v2 -; SI-NEXT: v_mad_f32 v3, -v2, v1, v3 -; SI-NEXT: v_cvt_i32_f32_e32 v2, v2 -; SI-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1| +; SI-NEXT: v_mul_f32_e32 v1, v3, v4 +; SI-NEXT: v_trunc_f32_e32 v1, v1 +; SI-NEXT: v_mad_f32 v3, -v1, v2, v3 +; SI-NEXT: v_cvt_i32_f32_e32 v1, v1 +; SI-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| ; SI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -2359,46 +2372,46 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s1, s0 ; VI-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 -; VI-NEXT: buffer_load_sbyte v2, off, s[0:3], 0 +; VI-NEXT: buffer_load_sbyte v1, off, s[0:3], 0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cvt_f32_i32_e32 v1, v0 +; VI-NEXT: v_cvt_f32_i32_e32 v2, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f32_i32_e32 v3, v2 -; VI-NEXT: v_xor_b32_e32 v0, v2, v0 +; VI-NEXT: v_cvt_f32_i32_e32 v3, v1 +; VI-NEXT: v_xor_b32_e32 v0, v1, v0 ; VI-NEXT: v_ashrrev_i32_e32 v0, 30, v0 -; VI-NEXT: v_rcp_iflag_f32_e32 v4, v1 +; VI-NEXT: v_rcp_iflag_f32_e32 v4, v2 ; VI-NEXT: v_or_b32_e32 v0, 1, v0 -; VI-NEXT: v_mul_f32_e32 v2, v3, v4 -; VI-NEXT: v_trunc_f32_e32 v2, v2 -; VI-NEXT: v_mad_f32 v3, -v2, v1, v3 -; VI-NEXT: v_cvt_i32_f32_e32 v2, v2 -; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1| +; VI-NEXT: v_mul_f32_e32 v1, v3, v4 +; VI-NEXT: v_trunc_f32_e32 v1, v1 +; VI-NEXT: v_mad_f32 v3, -v1, v2, v3 +; VI-NEXT: v_cvt_i32_f32_e32 v1, v1 +; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| ; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GCN-LABEL: fdiv_test_denormals: ; GCN: ; %bb.0: ; %bb +; GCN-NEXT: flat_load_sbyte v2, v[0:1] ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: flat_load_sbyte v2, v[0:1] -; GCN-NEXT: flat_load_sbyte v4, v[0:1] +; GCN-NEXT: flat_load_sbyte v3, v[0:1] ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cvt_f32_i32_e32 v3, v2 +; GCN-NEXT: v_cvt_f32_i32_e32 v4, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_i32_e32 v5, v4 -; GCN-NEXT: v_xor_b32_e32 v2, v4, v2 +; GCN-NEXT: v_cvt_f32_i32_e32 v5, v3 +; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; GCN-NEXT: v_xor_b32_e32 v2, v3, v2 ; GCN-NEXT: v_ashrrev_i32_e32 v2, 30, v2 -; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v3 ; GCN-NEXT: v_or_b32_e32 v2, 1, v2 -; GCN-NEXT: v_mul_f32_e32 v4, v5, v6 -; GCN-NEXT: v_trunc_f32_e32 v4, v4 -; GCN-NEXT: v_mad_f32 v5, -v4, v3, v5 -; GCN-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, |v3| +; GCN-NEXT: v_mul_f32_e32 v3, v5, v6 +; GCN-NEXT: v_trunc_f32_e32 v3, v3 +; GCN-NEXT: v_mad_f32 v5, -v3, v4, v5 +; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, |v4| ; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GCN-NEXT: v_add_u32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GCN-NEXT: flat_store_byte v[0:1], v2 ; GCN-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -124,20 +124,20 @@ ; ; GCN-IR-LABEL: s_test_udiv_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GCN-IR-NEXT: s_mov_b64 s[2:3], 0 +; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[6:7], 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[0:1], 0 -; GCN-IR-NEXT: s_flbit_i32_b32 s12, s0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[4:5], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[2:3], 0 +; GCN-IR-NEXT: s_flbit_i32_b32 s12, s4 ; GCN-IR-NEXT: s_add_i32 s12, s12, 32 ; GCN-IR-NEXT: s_or_b64 s[14:15], s[8:9], s[10:11] -; GCN-IR-NEXT: s_flbit_i32_b32 s8, s1 +; GCN-IR-NEXT: s_flbit_i32_b32 s8, s5 ; GCN-IR-NEXT: s_min_u32 s10, s12, s8 -; GCN-IR-NEXT: s_flbit_i32_b32 s8, s6 +; GCN-IR-NEXT: s_flbit_i32_b32 s8, s2 ; GCN-IR-NEXT: s_add_i32 s8, s8, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s9, s7 +; GCN-IR-NEXT: s_flbit_i32_b32 s9, s3 ; GCN-IR-NEXT: s_min_u32 s12, s8, s9 ; GCN-IR-NEXT: s_sub_u32 s8, s10, s12 ; GCN-IR-NEXT: s_subb_u32 s9, 0, 0 @@ -157,56 +157,56 @@ ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[14:15], v[0:1] ; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, vcc -; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[6:7], s8 +; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[6:7], s14 -; GCN-IR-NEXT: s_add_u32 s16, s0, -1 -; GCN-IR-NEXT: s_addc_u32 s17, s1, -1 +; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[2:3], s14 +; GCN-IR-NEXT: s_add_u32 s16, s4, -1 +; GCN-IR-NEXT: s_addc_u32 s17, s5, -1 ; GCN-IR-NEXT: s_not_b64 s[2:3], s[10:11] -; GCN-IR-NEXT: s_add_u32 s6, s2, s12 +; GCN-IR-NEXT: s_add_u32 s2, s2, s12 ; GCN-IR-NEXT: s_mov_b32 s13, s11 -; GCN-IR-NEXT: s_addc_u32 s7, s3, s11 +; GCN-IR-NEXT: s_addc_u32 s3, s3, s11 ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 -; GCN-IR-NEXT: s_mov_b32 s3, 0 +; GCN-IR-NEXT: s_mov_b32 s7, 0 ; GCN-IR-NEXT: .LBB0_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[14:15], 1 -; GCN-IR-NEXT: s_lshr_b32 s2, s9, 31 +; GCN-IR-NEXT: s_lshr_b32 s6, s9, 31 ; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 -; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[2:3] +; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[6:7] ; GCN-IR-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] -; GCN-IR-NEXT: s_sub_u32 s2, s16, s12 -; GCN-IR-NEXT: s_subb_u32 s2, s17, s13 -; GCN-IR-NEXT: s_ashr_i32 s10, s2, 31 +; GCN-IR-NEXT: s_sub_u32 s6, s16, s12 +; GCN-IR-NEXT: s_subb_u32 s6, s17, s13 +; GCN-IR-NEXT: s_ashr_i32 s10, s6, 31 ; GCN-IR-NEXT: s_mov_b32 s11, s10 -; GCN-IR-NEXT: s_and_b32 s2, s10, 1 -; GCN-IR-NEXT: s_and_b64 s[14:15], s[10:11], s[0:1] +; GCN-IR-NEXT: s_and_b32 s6, s10, 1 +; GCN-IR-NEXT: s_and_b64 s[14:15], s[10:11], s[4:5] ; GCN-IR-NEXT: s_sub_u32 s14, s12, s14 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s2 ; GCN-IR-NEXT: s_subb_u32 s15, s13, s15 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s7 -; GCN-IR-NEXT: s_add_u32 s6, s6, 1 -; GCN-IR-NEXT: s_addc_u32 s7, s7, 0 -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] -; GCN-IR-NEXT: s_mov_b64 s[10:11], s[2:3] +; GCN-IR-NEXT: v_mov_b32_e32 v1, s3 +; GCN-IR-NEXT: s_add_u32 s2, s2, 1 +; GCN-IR-NEXT: s_addc_u32 s3, s3, 0 +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] +; GCN-IR-NEXT: s_mov_b64 s[10:11], s[6:7] ; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3 ; GCN-IR-NEXT: .LBB0_4: ; %Flow6 -; GCN-IR-NEXT: s_lshl_b64 s[0:1], s[8:9], 1 -; GCN-IR-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s1 +; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[8:9], 1 +; GCN-IR-NEXT: s_or_b64 s[2:3], s[6:7], s[2:3] +; GCN-IR-NEXT: v_mov_b32_e32 v0, s2 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s3 ; GCN-IR-NEXT: s_branch .LBB0_6 ; GCN-IR-NEXT: .LBB0_5: -; GCN-IR-NEXT: v_mov_b32_e32 v0, s7 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s3 ; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[14:15] -; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s2 ; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[14:15] ; GCN-IR-NEXT: .LBB0_6: ; %udiv-end -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s6, -1 -; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %result = udiv i64 %x, %y store i64 %result, i64 addrspace(1)* %out @@ -419,18 +419,18 @@ define amdgpu_kernel void @s_test_udiv24_64(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_udiv24_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xe -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: s_lshr_b32 s0, s0, 8 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GCN-NEXT: s_lshr_b32 s0, s7, 8 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, s0 -; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_lshr_b32 s2, s4, 8 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GCN-NEXT: s_lshr_b32 s2, s3, 8 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -439,23 +439,23 @@ ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc ; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_udiv24_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s0, s[0:1], 0xe -; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_load_dword s4, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_mov_b32 s1, s5 -; GCN-IR-NEXT: s_lshr_b32 s0, s0, 8 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GCN-IR-NEXT: s_lshr_b32 s0, s7, 8 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s0 -; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_lshr_b32 s2, s4, 8 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GCN-IR-NEXT: s_lshr_b32 s2, s3, 8 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -464,7 +464,7 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 ; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc ; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm %1 = lshr i64 %x, 40 %2 = lshr i64 %y, 40 @@ -518,17 +518,16 @@ define amdgpu_kernel void @s_test_udiv32_i64(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_udiv32_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s2, s[0:1], 0xe +; GCN-NEXT: s_load_dword s4, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s3 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -541,17 +540,16 @@ ; ; GCN-IR-LABEL: s_test_udiv32_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dword s4, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s3 ; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -571,18 +569,18 @@ define amdgpu_kernel void @s_test_udiv31_i64(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_udiv31_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xe -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: s_lshr_b32 s0, s0, 1 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GCN-NEXT: s_lshr_b32 s0, s7, 1 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, s0 -; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_lshr_b32 s2, s4, 1 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GCN-NEXT: s_lshr_b32 s2, s3, 1 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -591,23 +589,23 @@ ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc ; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_udiv31_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s0, s[0:1], 0xe -; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_load_dword s4, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_mov_b32 s1, s5 -; GCN-IR-NEXT: s_lshr_b32 s0, s0, 1 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GCN-IR-NEXT: s_lshr_b32 s0, s7, 1 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s0 -; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_lshr_b32 s2, s4, 1 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GCN-IR-NEXT: s_lshr_b32 s2, s3, 1 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -616,7 +614,7 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 ; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc ; GCN-IR-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm %1 = lshr i64 %x, 33 %2 = lshr i64 %y, 33 @@ -628,18 +626,18 @@ define amdgpu_kernel void @s_test_udiv23_i64(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_udiv23_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xe -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: s_lshr_b32 s0, s0, 9 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GCN-NEXT: s_lshr_b32 s0, s7, 9 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, s0 -; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_lshr_b32 s2, s4, 9 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GCN-NEXT: s_lshr_b32 s2, s3, 9 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -648,23 +646,23 @@ ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc ; GCN-NEXT: v_and_b32_e32 v0, 0x7fffff, v0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_udiv23_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s0, s[0:1], 0xe -; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_load_dword s4, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_mov_b32 s1, s5 -; GCN-IR-NEXT: s_lshr_b32 s0, s0, 9 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GCN-IR-NEXT: s_lshr_b32 s0, s7, 9 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s0 -; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_lshr_b32 s2, s4, 9 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GCN-IR-NEXT: s_lshr_b32 s2, s3, 9 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -673,7 +671,7 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 ; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc ; GCN-IR-NEXT: v_and_b32_e32 v0, 0x7fffff, v0 -; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm %1 = lshr i64 %x, 41 %2 = lshr i64 %y, 41 @@ -685,35 +683,35 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48 %y) { ; GCN-LABEL: s_test_udiv24_i48: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s2, s[0:1], 0xd -; GCN-NEXT: s_load_dword s3, s[0:1], 0xe -; GCN-NEXT: s_mov_b32 s7, 0xff000000 +; GCN-NEXT: s_load_dword s2, s[0:1], 0xe +; GCN-NEXT: s_load_dword s4, s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s6, 0xffff +; GCN-NEXT: s_mov_b32 s7, 0xff000000 ; GCN-NEXT: v_cvt_f32_ubyte3_e32 v2, s6 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s2, s2, s7 -; GCN-NEXT: s_and_b32 s3, s3, s6 +; GCN-NEXT: s_and_b32 s3, s2, s6 +; GCN-NEXT: s_and_b32 s2, s4, s7 ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_alignbit_b32 v0, s3, v0, 24 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v0 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GCN-NEXT: s_load_dword s8, s[0:1], 0xb -; GCN-NEXT: s_load_dword s0, s[0:1], 0xc -; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: s_load_dword s9, s[0:1], 0xc +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_lshr_b64 s[0:1], s[2:3], 24 ; GCN-NEXT: v_mac_f32_e32 v1, 0x4f800000, v2 ; GCN-NEXT: v_rcp_f32_e32 v1, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s6, s0, s6 +; GCN-NEXT: s_and_b32 s6, s9, s6 ; GCN-NEXT: s_and_b32 s8, s8, s7 -; GCN-NEXT: s_lshr_b64 s[0:1], s[2:3], 24 +; GCN-NEXT: s_sub_u32 s0, 0, s0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 ; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GCN-NEXT: s_sub_u32 s0, 0, s0 ; GCN-NEXT: s_subb_u32 s1, 0, s1 +; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: v_mul_hi_u32 v3, s0, v1 ; GCN-NEXT: v_mul_lo_u32 v4, s0, v2 ; GCN-NEXT: v_mul_lo_u32 v5, s1, v1 @@ -804,18 +802,18 @@ ; ; GCN-IR-LABEL: s_test_udiv24_i48: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xc ; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xb -; GCN-IR-NEXT: s_load_dword s3, s[0:1], 0xc +; GCN-IR-NEXT: s_load_dword s3, s[0:1], 0xb ; GCN-IR-NEXT: s_load_dword s6, s[0:1], 0xd ; GCN-IR-NEXT: s_load_dword s7, s[0:1], 0xe ; GCN-IR-NEXT: s_mov_b32 s8, 0xffff -; GCN-IR-NEXT: s_mov_b32 s9, 0xff000000 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_and_b32 s1, s3, s8 -; GCN-IR-NEXT: s_and_b32 s0, s2, s9 +; GCN-IR-NEXT: s_and_b32 s1, s2, s8 +; GCN-IR-NEXT: s_mov_b32 s2, 0xff000000 +; GCN-IR-NEXT: s_and_b32 s0, s3, s2 ; GCN-IR-NEXT: s_and_b32 s3, s7, s8 -; GCN-IR-NEXT: s_and_b32 s2, s6, s9 +; GCN-IR-NEXT: s_and_b32 s2, s6, s2 ; GCN-IR-NEXT: s_lshr_b64 s[6:7], s[0:1], 24 ; GCN-IR-NEXT: s_lshr_b64 s[2:3], s[2:3], 24 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[2:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll --- a/llvm/test/CodeGen/AMDGPU/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll @@ -39,7 +39,6 @@ ; GFX6-NEXT: s_load_dword s2, s[0:1], 0x26 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x13 -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x1d ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -51,12 +50,14 @@ ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX6-NEXT: s_load_dword s3, s[0:1], 0x1d ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, v0, s2 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s0, v1 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s3, v1 ; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v1 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s2, v1 @@ -75,6 +76,7 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s6, s[0:1], 0x98 ; GFX8-NEXT: s_load_dword s7, s[0:1], 0x74 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX8-NEXT: s_sub_i32 s2, 0, s6 @@ -83,7 +85,6 @@ ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s2, v0 ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; GFX8-NEXT: v_mul_hi_u32 v2, s7, v0 diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll --- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll @@ -235,27 +235,27 @@ ; ; GFX8-LABEL: s_uint_to_fp_v2i64_to_v2f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_flbit_i32_b32 s6, s3 -; GFX8-NEXT: s_flbit_i32_b32 s7, s1 -; GFX8-NEXT: s_min_u32 s6, s6, 32 -; GFX8-NEXT: s_min_u32 s7, s7, 32 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s6 -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s7 +; GFX8-NEXT: s_flbit_i32_b32 s2, s7 +; GFX8-NEXT: s_flbit_i32_b32 s3, s5 +; GFX8-NEXT: s_min_u32 s8, s2, 32 +; GFX8-NEXT: s_min_u32 s9, s3, 32 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s8 ; GFX8-NEXT: s_min_u32 s2, s2, 1 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], s9 ; GFX8-NEXT: s_or_b32 s2, s3, s2 -; GFX8-NEXT: s_min_u32 s0, s0, 1 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX8-NEXT: s_or_b32 s0, s1, s0 -; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s0 -; GFX8-NEXT: s_sub_i32 s0, 32, s6 -; GFX8-NEXT: v_ldexp_f32 v1, v0, s0 -; GFX8-NEXT: s_sub_i32 s0, 32, s7 -; GFX8-NEXT: v_ldexp_f32 v0, v2, s0 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: s_min_u32 s2, s4, 1 +; GFX8-NEXT: s_or_b32 s2, s5, s2 +; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s2 +; GFX8-NEXT: s_sub_i32 s2, 32, s8 +; GFX8-NEXT: v_ldexp_f32 v1, v0, s2 +; GFX8-NEXT: s_sub_i32 s2, 32, s9 +; GFX8-NEXT: v_ldexp_f32 v0, v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm %result = uitofp <2 x i64> %in to <2 x float> @@ -324,47 +324,47 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s2, v1 ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 16, v5 -; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v6, vcc -; GFX8-NEXT: flat_load_dwordx4 v[1:4], v[1:2] +; GFX8-NEXT: flat_load_dwordx4 v[1:4], v[5:6] +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 16, v5 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc ; GFX8-NEXT: flat_load_dwordx4 v[5:8], v[5:6] ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v0 ; GFX8-NEXT: v_mov_b32_e32 v10, s1 ; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_ffbh_u32_e32 v12, v4 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_ffbh_u32_e32 v0, v8 -; GFX8-NEXT: v_ffbh_u32_e32 v11, v6 -; GFX8-NEXT: v_ffbh_u32_e32 v13, v2 +; GFX8-NEXT: v_ffbh_u32_e32 v0, v4 +; GFX8-NEXT: v_ffbh_u32_e32 v11, v2 ; GFX8-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX8-NEXT: v_min_u32_e32 v11, 32, v11 +; GFX8-NEXT: v_lshlrev_b64 v[3:4], v0, v[3:4] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_ffbh_u32_e32 v12, v8 +; GFX8-NEXT: v_ffbh_u32_e32 v13, v6 ; GFX8-NEXT: v_min_u32_e32 v12, 32, v12 ; GFX8-NEXT: v_min_u32_e32 v13, 32, v13 -; GFX8-NEXT: v_lshlrev_b64 v[7:8], v0, v[7:8] ; GFX8-NEXT: v_sub_u32_e32 v14, vcc, 32, v0 -; GFX8-NEXT: v_lshlrev_b64 v[5:6], v11, v[5:6] -; GFX8-NEXT: v_lshlrev_b64 v[3:4], v12, v[3:4] -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v13, v[1:2] -; GFX8-NEXT: v_min_u32_e32 v7, 1, v7 -; GFX8-NEXT: v_min_u32_e32 v5, 1, v5 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v11, v[1:2] +; GFX8-NEXT: v_lshlrev_b64 v[7:8], v12, v[7:8] +; GFX8-NEXT: v_lshlrev_b64 v[5:6], v13, v[5:6] ; GFX8-NEXT: v_min_u32_e32 v3, 1, v3 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX8-NEXT: v_or_b32_e32 v7, v8, v7 -; GFX8-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX8-NEXT: v_min_u32_e32 v7, 1, v7 +; GFX8-NEXT: v_min_u32_e32 v5, 1, v5 ; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v7 -; GFX8-NEXT: v_cvt_f32_u32_e32 v4, v5 +; GFX8-NEXT: v_or_b32_e32 v1, v8, v7 +; GFX8-NEXT: v_or_b32_e32 v4, v6, v5 ; GFX8-NEXT: v_cvt_f32_u32_e32 v3, v3 -; GFX8-NEXT: v_cvt_f32_u32_e32 v5, v0 -; GFX8-NEXT: v_sub_u32_e32 v11, vcc, 32, v11 -; GFX8-NEXT: v_sub_u32_e32 v12, vcc, 32, v12 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v13 -; GFX8-NEXT: v_ldexp_f32 v1, v1, v14 -; GFX8-NEXT: v_ldexp_f32 v0, v4, v11 -; GFX8-NEXT: v_ldexp_f32 v3, v3, v12 -; GFX8-NEXT: v_ldexp_f32 v2, v5, v2 +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX8-NEXT: v_cvt_f32_u32_e32 v5, v1 +; GFX8-NEXT: v_cvt_f32_u32_e32 v4, v4 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v11 +; GFX8-NEXT: v_sub_u32_e32 v11, vcc, 32, v12 +; GFX8-NEXT: v_sub_u32_e32 v12, vcc, 32, v13 +; GFX8-NEXT: v_ldexp_f32 v1, v3, v14 +; GFX8-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX8-NEXT: v_ldexp_f32 v3, v5, v11 +; GFX8-NEXT: v_ldexp_f32 v2, v4, v12 ; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[0:3] ; GFX8-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -418,11 +418,11 @@ ; GFX8-NEXT: s_min_u32 s9, s3, 32 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s8 ; GFX8-NEXT: s_min_u32 s2, s2, 1 -; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], s9 ; GFX8-NEXT: s_or_b32 s2, s3, s2 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX8-NEXT: s_min_u32 s2, s4, 1 -; GFX8-NEXT: s_or_b32 s2, s5, s2 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], s9 +; GFX8-NEXT: s_min_u32 s2, s2, 1 +; GFX8-NEXT: s_or_b32 s2, s3, s2 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s2 ; GFX8-NEXT: s_sub_i32 s6, 32, s8 ; GFX8-NEXT: s_sub_i32 s2, 32, s9 @@ -509,53 +509,53 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s2, v1 ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 16, v5 -; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v6, vcc -; GFX8-NEXT: flat_load_dwordx4 v[1:4], v[1:2] +; GFX8-NEXT: flat_load_dwordx4 v[1:4], v[5:6] +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 16, v5 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc ; GFX8-NEXT: flat_load_dwordx4 v[5:8], v[5:6] ; GFX8-NEXT: v_mov_b32_e32 v10, s1 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_ffbh_u32_e32 v12, v4 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_ffbh_u32_e32 v0, v8 -; GFX8-NEXT: v_ffbh_u32_e32 v11, v6 -; GFX8-NEXT: v_ffbh_u32_e32 v13, v2 +; GFX8-NEXT: v_ffbh_u32_e32 v0, v4 +; GFX8-NEXT: v_ffbh_u32_e32 v11, v2 ; GFX8-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX8-NEXT: v_min_u32_e32 v11, 32, v11 +; GFX8-NEXT: v_lshlrev_b64 v[3:4], v0, v[3:4] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_ffbh_u32_e32 v12, v8 +; GFX8-NEXT: v_ffbh_u32_e32 v13, v6 ; GFX8-NEXT: v_min_u32_e32 v12, 32, v12 ; GFX8-NEXT: v_min_u32_e32 v13, 32, v13 -; GFX8-NEXT: v_lshlrev_b64 v[7:8], v0, v[7:8] ; GFX8-NEXT: v_sub_u32_e32 v14, vcc, 32, v0 -; GFX8-NEXT: v_lshlrev_b64 v[5:6], v11, v[5:6] -; GFX8-NEXT: v_lshlrev_b64 v[3:4], v12, v[3:4] -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v13, v[1:2] -; GFX8-NEXT: v_min_u32_e32 v7, 1, v7 -; GFX8-NEXT: v_min_u32_e32 v5, 1, v5 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v11, v[1:2] +; GFX8-NEXT: v_lshlrev_b64 v[7:8], v12, v[7:8] +; GFX8-NEXT: v_lshlrev_b64 v[5:6], v13, v[5:6] ; GFX8-NEXT: v_min_u32_e32 v3, 1, v3 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX8-NEXT: v_or_b32_e32 v7, v8, v7 -; GFX8-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX8-NEXT: v_min_u32_e32 v7, 1, v7 +; GFX8-NEXT: v_min_u32_e32 v5, 1, v5 ; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v7 -; GFX8-NEXT: v_cvt_f32_u32_e32 v4, v5 +; GFX8-NEXT: v_or_b32_e32 v1, v8, v7 +; GFX8-NEXT: v_or_b32_e32 v4, v6, v5 ; GFX8-NEXT: v_cvt_f32_u32_e32 v3, v3 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX8-NEXT: v_sub_u32_e32 v11, vcc, 32, v11 -; GFX8-NEXT: v_sub_u32_e32 v12, vcc, 32, v12 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v13 -; GFX8-NEXT: v_ldexp_f32 v1, v1, v14 -; GFX8-NEXT: v_ldexp_f32 v4, v4, v11 -; GFX8-NEXT: v_ldexp_f32 v3, v3, v12 +; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX8-NEXT: v_cvt_f32_u32_e32 v4, v4 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v11 +; GFX8-NEXT: v_sub_u32_e32 v11, vcc, 32, v12 +; GFX8-NEXT: v_sub_u32_e32 v12, vcc, 32, v13 +; GFX8-NEXT: v_ldexp_f32 v3, v3, v14 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX8-NEXT: v_cvt_f16_f32_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; GFX8-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX8-NEXT: v_ldexp_f32 v1, v1, v11 +; GFX8-NEXT: v_ldexp_f32 v2, v4, v12 ; GFX8-NEXT: v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; GFX8-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX8-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GFX8-NEXT: v_cvt_f16_f32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GFX8-NEXT: v_cvt_f16_f32_e32 v6, v2 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v9 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v10, vcc -; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v2, v4, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v6, v5 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll --- a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll @@ -201,7 +201,7 @@ ; GCN: s_branch [[ENDIF_LABEL:.L[0-9_A-Za-z]+]] ; GCN: [[IF_LABEL]]: -; GCN-NEXT: v_mov_b32_e32 [[IMM_REG]], 1 +; GCN: v_mov_b32_e32 [[IMM_REG]], 1 ; GCN-NEXT: [[ENDIF_LABEL]]: ; GCN: buffer_store_dword [[IMM_REG]] diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -123,19 +123,19 @@ ; ; GCN-IR-LABEL: s_test_urem_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GCN-IR-NEXT: s_mov_b64 s[2:3], 0 +; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[6:7], 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[0:1], 0 -; GCN-IR-NEXT: s_flbit_i32_b32 s12, s0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[4:5], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[2:3], 0 +; GCN-IR-NEXT: s_flbit_i32_b32 s12, s4 ; GCN-IR-NEXT: s_or_b64 s[14:15], s[8:9], s[10:11] -; GCN-IR-NEXT: s_flbit_i32_b32 s10, s6 +; GCN-IR-NEXT: s_flbit_i32_b32 s10, s2 ; GCN-IR-NEXT: s_add_i32 s12, s12, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s8, s1 +; GCN-IR-NEXT: s_flbit_i32_b32 s8, s5 ; GCN-IR-NEXT: s_add_i32 s10, s10, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s11, s7 +; GCN-IR-NEXT: s_flbit_i32_b32 s11, s3 ; GCN-IR-NEXT: s_min_u32 s8, s12, s8 ; GCN-IR-NEXT: s_min_u32 s12, s10, s11 ; GCN-IR-NEXT: s_sub_u32 s10, s8, s12 @@ -156,31 +156,31 @@ ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[14:15], v[0:1] ; GCN-IR-NEXT: s_sub_i32 s10, 63, s10 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, vcc -; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[6:7], s10 +; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[2:3], s10 ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[6:7], s14 -; GCN-IR-NEXT: s_add_u32 s16, s0, -1 -; GCN-IR-NEXT: s_addc_u32 s17, s1, -1 -; GCN-IR-NEXT: s_not_b64 s[2:3], s[8:9] +; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[2:3], s14 +; GCN-IR-NEXT: s_add_u32 s16, s4, -1 +; GCN-IR-NEXT: s_addc_u32 s17, s5, -1 +; GCN-IR-NEXT: s_not_b64 s[6:7], s[8:9] ; GCN-IR-NEXT: s_mov_b32 s13, s9 -; GCN-IR-NEXT: s_add_u32 s8, s2, s12 -; GCN-IR-NEXT: s_addc_u32 s9, s3, s9 +; GCN-IR-NEXT: s_add_u32 s8, s6, s12 +; GCN-IR-NEXT: s_addc_u32 s9, s7, s9 ; GCN-IR-NEXT: s_mov_b64 s[12:13], 0 -; GCN-IR-NEXT: s_mov_b32 s3, 0 +; GCN-IR-NEXT: s_mov_b32 s7, 0 ; GCN-IR-NEXT: .LBB0_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: s_lshl_b64 s[14:15], s[14:15], 1 -; GCN-IR-NEXT: s_lshr_b32 s2, s11, 31 +; GCN-IR-NEXT: s_lshr_b32 s6, s11, 31 ; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 -; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[2:3] +; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[6:7] ; GCN-IR-NEXT: s_or_b64 s[10:11], s[12:13], s[10:11] -; GCN-IR-NEXT: s_sub_u32 s2, s16, s14 -; GCN-IR-NEXT: s_subb_u32 s2, s17, s15 -; GCN-IR-NEXT: s_ashr_i32 s12, s2, 31 +; GCN-IR-NEXT: s_sub_u32 s6, s16, s14 +; GCN-IR-NEXT: s_subb_u32 s6, s17, s15 +; GCN-IR-NEXT: s_ashr_i32 s12, s6, 31 ; GCN-IR-NEXT: s_mov_b32 s13, s12 -; GCN-IR-NEXT: s_and_b32 s2, s12, 1 -; GCN-IR-NEXT: s_and_b64 s[18:19], s[12:13], s[0:1] +; GCN-IR-NEXT: s_and_b32 s6, s12, 1 +; GCN-IR-NEXT: s_and_b64 s[18:19], s[12:13], s[4:5] ; GCN-IR-NEXT: s_sub_u32 s14, s14, s18 ; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 ; GCN-IR-NEXT: s_subb_u32 s15, s15, s19 @@ -188,33 +188,33 @@ ; GCN-IR-NEXT: s_add_u32 s8, s8, 1 ; GCN-IR-NEXT: s_addc_u32 s9, s9, 0 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] -; GCN-IR-NEXT: s_mov_b64 s[12:13], s[2:3] +; GCN-IR-NEXT: s_mov_b64 s[12:13], s[6:7] ; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3 ; GCN-IR-NEXT: .LBB0_4: ; %Flow6 ; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[10:11], 1 -; GCN-IR-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GCN-IR-NEXT: v_mov_b32_e32 v0, s2 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s3 +; GCN-IR-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s7 ; GCN-IR-NEXT: s_branch .LBB0_6 ; GCN-IR-NEXT: .LBB0_5: -; GCN-IR-NEXT: v_mov_b32_e32 v0, s7 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s3 ; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[14:15] -; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s2 ; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[14:15] ; GCN-IR-NEXT: .LBB0_6: ; %udiv-end -; GCN-IR-NEXT: v_mul_lo_u32 v1, s0, v1 -; GCN-IR-NEXT: v_mul_hi_u32 v2, s0, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v3, s1, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v0, s0, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v1, s4, v1 +; GCN-IR-NEXT: v_mul_hi_u32 v2, s4, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v3, s5, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v0, s4, v0 ; GCN-IR-NEXT: s_mov_b32 s11, 0xf000 ; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s7 -; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s3 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 ; GCN-IR-NEXT: s_mov_b32 s10, -1 -; GCN-IR-NEXT: s_mov_b32 s8, s4 -; GCN-IR-NEXT: s_mov_b32 s9, s5 +; GCN-IR-NEXT: s_mov_b32 s8, s0 +; GCN-IR-NEXT: s_mov_b32 s9, s1 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GCN-IR-NEXT: s_endpgm @@ -434,17 +434,16 @@ define amdgpu_kernel void @s_test_urem31_i64(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_urem31_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s2, s[0:1], 0xe -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s4, s2, 1 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xe ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_lshr_b32 s4, s4, 1 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GCN-NEXT: s_lshr_b32 s5, s3, 1 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s5 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -460,17 +459,16 @@ ; ; GCN-IR-LABEL: s_test_urem31_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xe -; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_lshr_b32 s4, s2, 1 +; GCN-IR-NEXT: s_load_dword s4, s[0:1], 0xe ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_lshr_b32 s4, s4, 1 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GCN-IR-NEXT: s_lshr_b32 s5, s3, 1 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s5 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -493,21 +491,21 @@ define amdgpu_kernel void @s_test_urem31_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { ; GCN-LABEL: s_test_urem31_v2i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x11 +; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_mov_b32 s10, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s2, s9, 1 -; GCN-NEXT: s_lshr_b32 s0, s1, 1 +; GCN-NEXT: s_lshr_b32 s0, s5, 1 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, s2 -; GCN-NEXT: s_lshr_b32 s3, s3, 1 -; GCN-NEXT: v_cvt_f32_u32_e32 v4, s3 +; GCN-NEXT: s_lshr_b32 s1, s1, 1 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s1 +; GCN-NEXT: s_lshr_b32 s2, s3, 1 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_lshr_b32 s1, s11, 1 -; GCN-NEXT: v_cvt_f32_u32_e32 v3, s1 +; GCN-NEXT: s_lshr_b32 s3, s7, 1 +; GCN-NEXT: v_cvt_f32_u32_e32 v4, s3 +; GCN-NEXT: v_cvt_f32_u32_e32 v3, s2 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v5, v2 @@ -519,7 +517,7 @@ ; GCN-NEXT: v_mul_f32_e32 v2, v3, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v5, v2 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s1, v0 ; GCN-NEXT: v_mad_f32 v2, -v2, v4, v3 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 ; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc @@ -527,29 +525,29 @@ ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_brev_b32 s0, -2 ; GCN-NEXT: v_and_b32_e32 v0, s0, v0 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 ; GCN-NEXT: v_and_b32_e32 v2, s0, v2 ; GCN-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_urem31_v2i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x11 +; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GCN-IR-NEXT: s_mov_b32 s11, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s10, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_lshr_b32 s2, s9, 1 -; GCN-IR-NEXT: s_lshr_b32 s0, s1, 1 +; GCN-IR-NEXT: s_lshr_b32 s0, s5, 1 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s2 -; GCN-IR-NEXT: s_lshr_b32 s3, s3, 1 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v4, s3 +; GCN-IR-NEXT: s_lshr_b32 s1, s1, 1 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s1 +; GCN-IR-NEXT: s_lshr_b32 s2, s3, 1 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_lshr_b32 s1, s11, 1 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v3, s1 +; GCN-IR-NEXT: s_lshr_b32 s3, s7, 1 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v4, s3 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v3, s2 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v5, v2 @@ -561,7 +559,7 @@ ; GCN-IR-NEXT: v_mul_f32_e32 v2, v3, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v5, v2 -; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s1, v0 ; GCN-IR-NEXT: v_mad_f32 v2, -v2, v4, v3 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 ; GCN-IR-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc @@ -569,10 +567,10 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 ; GCN-IR-NEXT: s_brev_b32 s0, -2 ; GCN-IR-NEXT: v_and_b32_e32 v0, s0, v0 -; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 ; GCN-IR-NEXT: v_and_b32_e32 v2, s0, v2 ; GCN-IR-NEXT: v_mov_b32_e32 v3, v1 -; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-IR-NEXT: s_endpgm %1 = lshr <2 x i64> %x, %2 = lshr <2 x i64> %y, @@ -584,17 +582,16 @@ define amdgpu_kernel void @s_test_urem24_i64(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_urem24_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s2, s[0:1], 0xe -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s4, s2, 8 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xe ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_lshr_b32 s4, s4, 8 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GCN-NEXT: s_lshr_b32 s5, s3, 8 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s5 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -610,17 +607,16 @@ ; ; GCN-IR-LABEL: s_test_urem24_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xe -; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_lshr_b32 s4, s2, 8 +; GCN-IR-NEXT: s_load_dword s4, s[0:1], 0xe ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_lshr_b32 s4, s4, 8 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GCN-IR-NEXT: s_lshr_b32 s5, s3, 8 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s5 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -643,21 +639,21 @@ define amdgpu_kernel void @s_test_urem23_64_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { ; GCN-LABEL: s_test_urem23_64_v2i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x11 +; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_mov_b32 s10, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s2, s9, 1 -; GCN-NEXT: s_lshr_b32 s0, s1, 1 +; GCN-NEXT: s_lshr_b32 s0, s5, 1 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, s2 -; GCN-NEXT: s_lshr_b32 s3, s3, 9 -; GCN-NEXT: v_cvt_f32_u32_e32 v4, s3 +; GCN-NEXT: s_lshr_b32 s1, s1, 1 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s1 +; GCN-NEXT: s_lshr_b32 s2, s3, 9 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_lshr_b32 s1, s11, 9 -; GCN-NEXT: v_cvt_f32_u32_e32 v3, s1 +; GCN-NEXT: s_lshr_b32 s3, s7, 9 +; GCN-NEXT: v_cvt_f32_u32_e32 v4, s3 +; GCN-NEXT: v_cvt_f32_u32_e32 v3, s2 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v5, v2 @@ -669,7 +665,7 @@ ; GCN-NEXT: v_mul_f32_e32 v2, v3, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v5, v2 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s1, v0 ; GCN-NEXT: v_mad_f32 v2, -v2, v4, v3 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 ; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc @@ -677,29 +673,29 @@ ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_brev_b32 s0, -2 ; GCN-NEXT: v_and_b32_e32 v0, s0, v0 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 ; GCN-NEXT: v_and_b32_e32 v2, s0, v2 ; GCN-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_urem23_64_v2i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x11 +; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GCN-IR-NEXT: s_mov_b32 s11, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s10, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_lshr_b32 s2, s9, 1 -; GCN-IR-NEXT: s_lshr_b32 s0, s1, 1 +; GCN-IR-NEXT: s_lshr_b32 s0, s5, 1 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s2 -; GCN-IR-NEXT: s_lshr_b32 s3, s3, 9 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v4, s3 +; GCN-IR-NEXT: s_lshr_b32 s1, s1, 1 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s1 +; GCN-IR-NEXT: s_lshr_b32 s2, s3, 9 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_lshr_b32 s1, s11, 9 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v3, s1 +; GCN-IR-NEXT: s_lshr_b32 s3, s7, 9 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v4, s3 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v3, s2 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v5, v2 @@ -711,7 +707,7 @@ ; GCN-IR-NEXT: v_mul_f32_e32 v2, v3, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v5, v2 -; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s1, v0 ; GCN-IR-NEXT: v_mad_f32 v2, -v2, v4, v3 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 ; GCN-IR-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc @@ -719,10 +715,10 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 ; GCN-IR-NEXT: s_brev_b32 s0, -2 ; GCN-IR-NEXT: v_and_b32_e32 v0, s0, v0 -; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 ; GCN-IR-NEXT: v_and_b32_e32 v2, s0, v2 ; GCN-IR-NEXT: v_mov_b32_e32 v3, v1 -; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-IR-NEXT: s_endpgm %1 = lshr <2 x i64> %x, %2 = lshr <2 x i64> %y, diff --git a/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll b/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll --- a/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll +++ b/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll @@ -40,7 +40,7 @@ } ; GCN-LABEL: {{^}}test_use_s_v_s: -; GCN-DAG: s_load_dwordx2 s{{\[}}[[SA:[0-9]+]]:[[SB:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; SI: s_load_dwordx2 s{{\[}}[[SA:[0-9]+]]:[[SB:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} ; SI: buffer_load_dword [[VA0:v[0-9]+]] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_dword [[VA1:v[0-9]+]] @@ -52,6 +52,7 @@ ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_load_dword [[VA1:v[0-9]+]] ; VI-NEXT: s_waitcnt vmcnt(0) +; VI: s_load_dwordx2 s{{\[}}[[SA:[0-9]+]]:[[SB:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} ; GCN-NOT: v_mov_b32 ; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], s[[SB]] diff --git a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll --- a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll @@ -9,22 +9,22 @@ ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; SI-NEXT: buffer_load_ushort v1, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_madak_f32 v0, v1, v0, 0x41200000 +; SI-NEXT: v_madak_f32 v0, v0, v1, 0x41200000 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm @@ -35,17 +35,17 @@ ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 ; VI-NEXT: s_mov_b32 s11, s3 -; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_madak_f16 v0, v0, v1, 0x4900 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll b/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll --- a/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll @@ -115,15 +115,14 @@ ; GCN-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 4, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b64 s[0:1], s[6:7] -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 2, v0 -; GCN-NEXT: v_mov_b32_e32 v5, v2 -; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 addr64 glc +; GCN-NEXT: v_mov_b32_e32 v5, 0 +; GCN-NEXT: buffer_load_dwordx4 v[1:4], v[4:5], s[0:3], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_mov_b64 s[6:7], s[2:3] +; GCN-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: buffer_store_dword v0, v[4:5], s[4:7], 0 addr64 ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -27,14 +27,12 @@ ; GFX9-LABEL: shuffle_v4f16_234u: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 -; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1 -; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3 ; GFX9-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4f16_234u: @@ -729,13 +727,13 @@ ; GFX9-LABEL: shuffle_v4f16_3456: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v2, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_sdwa v2, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v1, v5, 16, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -764,11 +762,12 @@ ; GFX9-LABEL: shuffle_v4f16_5634: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 @@ -798,13 +797,13 @@ ; GFX9-LABEL: shuffle_v4f16_5734: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v5 ; GFX9-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 @@ -986,12 +985,12 @@ ; GFX9-LABEL: shuffle_v4f16_6161: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 -; GFX9-NEXT: global_load_dword v5, v[0:1], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v5 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll --- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll +++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll @@ -122,13 +122,13 @@ define amdgpu_kernel void @widen_i17_constant_load(i17 addrspace(4)* %arg) { ; SI-LABEL: widen_i17_constant_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s1, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s7, s[6:7], 0x0 +; SI-NEXT: s_load_dword s7, s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s4, 2 ; SI-NEXT: s_mov_b32 s5, s0 ; SI-NEXT: s_mov_b32 s6, s2 @@ -207,13 +207,13 @@ define amdgpu_kernel void @widen_v2i8_constant_load(<2 x i8> addrspace(4)* %arg) { ; SI-LABEL: widen_v2i8_constant_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s0, 0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s1, s[2:3], 0x0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_load_dword s1, s[0:1], 0x0 +; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_and_b32 s4, s1, 0xff00 ; SI-NEXT: s_add_i32 s1, s1, 12 ; SI-NEXT: s_or_b32 s1, s1, 4 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -640,19 +640,21 @@ ; GFX9-O3-NEXT: s_mov_b64 s[36:37], s[30:31] ; GFX9-O3-NEXT: v_mov_b32_e32 v6, s8 ; GFX9-O3-NEXT: v_mov_b32_e32 v7, s9 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[30:31], -1 +; GFX9-O3-NEXT: s_getpc_b64 s[34:35] +; GFX9-O3-NEXT: s_add_u32 s34, s34, strict_wwm_called_i64@gotpcrel32@lo+4 +; GFX9-O3-NEXT: s_addc_u32 s35, s35, strict_wwm_called_i64@gotpcrel32@hi+12 +; GFX9-O3-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 +; GFX9-O3-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: s_or_saveexec_b64 s[38:39], -1 -; GFX9-O3-NEXT: s_getpc_b64 s[30:31] -; GFX9-O3-NEXT: s_add_u32 s30, s30, strict_wwm_called_i64@gotpcrel32@lo+4 -; GFX9-O3-NEXT: s_addc_u32 s31, s31, strict_wwm_called_i64@gotpcrel32@hi+12 -; GFX9-O3-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0 ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7 ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-O3-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O3-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6